In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [19]:
# Load dataset (adjust path accordingly)
df = pd.read_csv("Churn_Modelling.csv")

# Check for missing values and summary
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB
None
         RowNumber    CustomerId   CreditScore           Age        Tenure  \
count  1000

In [20]:
# One-hot encode categorical columns
df_encoded = pd.get_dummies(df, drop_first=True)

# Inspect the first few rows after encoding
print(df_encoded.head())

   RowNumber  CustomerId  CreditScore  Age  Tenure    Balance  NumOfProducts  \
0          1    15634602          619   42       2       0.00              1   
1          2    15647311          608   41       1   83807.86              1   
2          3    15619304          502   42       8  159660.80              3   
3          4    15701354          699   39       1       0.00              2   
4          5    15737888          850   43       2  125510.82              1   

   HasCrCard  IsActiveMember  EstimatedSalary  ...  Surname_Zotova  \
0          1               1        101348.88  ...           False   
1          0               1        112542.58  ...           False   
2          1               0        113931.57  ...           False   
3          0               0         93826.63  ...           False   
4          1               1         79084.10  ...           False   

   Surname_Zox  Surname_Zubarev  Surname_Zubareva  Surname_Zuev  \
0        False            False

In [21]:
# Separate features (X) and target (y)
X = df_encoded.drop(columns=['Exited'])
y = df_encoded['Exited']

# Split into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (8000, 2944)
X_test shape: (2000, 2944)


In [22]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Normalize the feature set for both training and test sets
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(f"Normalized X_train shape: {X_train.shape}")
print(f"Normalized X_test shape: {X_test.shape}")

Normalized X_train shape: (8000, 2944)
Normalized X_test shape: (2000, 2944)


In [23]:
# Initialize Logistic Regression model
log_model = LogisticRegression()

# Train the model
log_model.fit(X_train, y_train)

# Predict the target values for the test set
y_pred_log = log_model.predict(X_test)

# Evaluate the Logistic Regression model
accuracy_log = accuracy_score(y_test, y_pred_log)
conf_matrix_log = confusion_matrix(y_test, y_pred_log)
classification_report_log = classification_report(y_test, y_pred_log)

print(f"Logistic Regression Accuracy: {accuracy_log}")
print(f"Logistic Regression Confusion Matrix:\n{conf_matrix_log}")
print(
    f"Logistic Regression Classification Report:\n{classification_report_log}")

Logistic Regression Accuracy: 0.781
Logistic Regression Confusion Matrix:
[[1476  131]
 [ 307   86]]
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1607
           1       0.40      0.22      0.28       393

    accuracy                           0.78      2000
   macro avg       0.61      0.57      0.58      2000
weighted avg       0.74      0.78      0.76      2000



In [24]:
# Initialize Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict the target values for the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
classification_report_rf = classification_report(y_test, y_pred_rf)

print(f"Random Forest Accuracy: {accuracy_rf}")
print(f"Random Forest Confusion Matrix:\n{conf_matrix_rf}")
print(f"Random Forest Classification Report:\n{classification_report_rf}")

Random Forest Accuracy: 0.859
Random Forest Confusion Matrix:
[[1582   25]
 [ 257  136]]
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.98      0.92      1607
           1       0.84      0.35      0.49       393

    accuracy                           0.86      2000
   macro avg       0.85      0.67      0.70      2000
weighted avg       0.86      0.86      0.83      2000



In [10]:
print(df.dtypes)

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object


In [12]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC

# Load the dataset
df = pd.read_csv("Churn_Modelling.csv")

# Drop 'Surname' column as it's not useful for prediction
df = df.drop('Surname', axis=1)

# One-Hot Encoding for 'Geography'
# drop_first to avoid multicollinearity
df = pd.get_dummies(df, columns=['Geography'], drop_first=True)

# Label Encoding for 'Gender'
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# Define X and y
X = df.drop('Exited', axis=1)  # Features
y = df['Exited']  # Target

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Normalize the feature set
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize Support Vector Machine
svm_model = SVC(random_state=42)

# Train the model
svm_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model

accuracy_svm = accuracy_score(y_test, y_pred_svm)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
classification_report_svm = classification_report(y_test, y_pred_svm)

print(f"SVM Accuracy: {accuracy_svm}")
print(f"SVM Confusion Matrix:\n{conf_matrix_svm}")
print(f"SVM Classification Report:\n{classification_report_svm}")

SVM Accuracy: 0.8545
SVM Confusion Matrix:
[[1559   48]
 [ 243  150]]
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.97      0.91      1607
           1       0.76      0.38      0.51       393

    accuracy                           0.85      2000
   macro avg       0.81      0.68      0.71      2000
weighted avg       0.84      0.85      0.83      2000



In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define parameter grid for Random Forest Classifier
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize GridSearchCV with Random Forest
grid_search_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                              param_grid=param_grid,
                              cv=3,
                              verbose=2,
                              n_jobs=-1)

# Fit GridSearchCV
grid_search_rf.fit(X_train, y_train)

# Best parameters and score
print(f"Best Parameters for Random Forest: {grid_search_rf.best_params_}")
print(f"Best Cross-Validation Score: {grid_search_rf.best_score_}")

# Predict with the best model
best_rf_model = grid_search_rf.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)

# Evaluate the tuned model
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
conf_matrix_best_rf = confusion_matrix(y_test, y_pred_best_rf)
classification_report_best_rf = classification_report(y_test, y_pred_best_rf)

print(f"Tuned Random Forest Accuracy: {accuracy_best_rf}")
print(f"Tuned Random Forest Confusion Matrix:\n{conf_matrix_best_rf}")
print(
    f"Tuned Random Forest Classification Report:\n{classification_report_best_rf}")

Fitting 3 folds for each of 36 candidates, totalling 108 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters for Random Forest: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best Cross-Validation Score: 0.8646244331986633
Tuned Random Forest Accuracy: 0.8625
Tuned Random Forest Confusion Matrix:
[[1541   66]
 [ 209  184]]
Tuned Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.74      0.47      0.57       393

    accuracy                           0.86      2000
   macro avg       0.81      0.71      0.75      2000
weighted avg       0.85      0.86      0.85      2000

