In [11]:
from pathlib import Path

import gdown
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [13]:
# Google Drive file ID from the shared link
file_id = "1hE5e7Gaq3VEmQXgu2aJdvyqE4xliCttX"

# Direct-download URL for that file and the local path it is saved to
url = f"https://drive.google.com/uc?id={file_id}"
output = "Churn_Modelling.csv"

# Only hit the network when no local copy exists, so re-running the notebook
# (Restart & Run All) does not re-download the file or require connectivity.
if not Path(output).exists():
    # Depending on the gdown version a failed download either raises or
    # returns None; turn the None case into an explicit error here rather
    # than letting pd.read_csv fail later on a missing file.
    if gdown.download(url, output, quiet=False) is None:
        raise RuntimeError(f"Could not download {url} to {output}")

# Load CSV data from the local file
data = pd.read_csv(output)

Downloading...
From: https://drive.google.com/uc?id=1hE5e7Gaq3VEmQXgu2aJdvyqE4xliCttX
To: C:\Users\shree\Churn_Modelling.csv
100%|███████████████████████████████████████████████████████████████████████████████| 685k/685k [00:00<00:00, 7.60MB/s]


In [14]:
# Drop the row-number, customer-id and surname columns, then discard any row
# that still contains a missing value.
# The frame is rebound instead of mutated in place, and errors='ignore' lets
# the drop succeed when the columns are already gone, so this cell can be
# re-run without raising KeyError.
data = data.drop(
    columns=['RowNumber', 'CustomerId', 'Surname'],
    errors='ignore',
).dropna()

# NOTE(review): this encoder is not used by any visible cell -- the encoding
# below is done with pd.get_dummies. Kept in case a later cell refers to it.
encoder = OneHotEncoder(drop='first')

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the dummies are not perfectly collinear.
data_encoded = pd.get_dummies(data, columns=['Geography', 'Gender'], drop_first=True)


In [15]:
# Separate the 'Exited' target column from the remaining feature columns
y = data_encoded['Exited']
X = data_encoded.drop(columns='Exited')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [16]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test-set information leaks into
# the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [17]:
# Initialize models
log_reg = LogisticRegression()
# The forest draws bootstrap samples and feature subsets at random; seed it
# (same seed as the train/test split) so the reported metrics are
# reproducible from one run to the next.
rf = RandomForestClassifier(random_state=42)

In [18]:
# Fit both classifiers on the standardized training split
log_reg.fit(X=X_train_scaled, y=y_train)
rf.fit(X=X_train_scaled, y=y_train)

In [9]:
# Predict labels for the standardized test split with each fitted model
y_pred_log_reg, y_pred_rf = (
    model.predict(X_test_scaled) for model in (log_reg, rf)
)

In [10]:
# Evaluate models
def evaluate_model(y_true, y_pred):
    """Score a set of predictions against the ground-truth labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``, each computed by the
        corresponding ``sklearn.metrics`` function with default arguments.
    """
    metric_fns = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(metric(y_true, y_pred) for metric in metric_fns)

# Score each model's test-set predictions; every result is an
# (accuracy, precision, recall, f1) tuple.
log_reg_scores = evaluate_model(y_true=y_test, y_pred=y_pred_log_reg)
rf_scores = evaluate_model(y_true=y_test, y_pred=y_pred_rf)

# Report the raw score tuples for both models
print("Logistic Regression Scores:", log_reg_scores)
print("Random Forest Scores:", rf_scores)

Logistic Regression Scores: (0.811, 0.5524475524475524, 0.2010178117048346, 0.2947761194029851)
Random Forest Scores: (0.867, 0.7591836734693878, 0.4732824427480916, 0.5830721003134797)
