<a href="https://colab.research.google.com/github/RadwaFathi/MachineLearningProject2/blob/main/MLProject2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score, f1_score
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
import numpy as np


# Read the training data into a DataFrame and preview its first rows.
TRAIN_CSV_PATH = "/content/train.csv"

df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15585961.0,Hs?,684.0,France,Male,41.0,10.0,0.0,2.0,1.0,1.0,173948.4,1.0
1,1,15643378.0,Bellucci,807.0,France,Male,32.0,2.0,0.0,2.0,1.0,0.0,144532.85,0.0
2,2,15651022.0,O'Donnell,553.0,Germany,Male,53.0,9.0,102278.52,1.0,1.0,0.0,158816.03,1.0
3,3,15676521.0,Chiang,587.0,France,Female,34.0,6.0,0.0,1.0,1.0,0.0,167984.72,1.0
4,4,15772650.0,Kambinachi,732.0,Germany,Female,30.0,5.0,135070.92,1.0,1.0,1.0,116097.26,0.0


In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               15000 non-null  int64  
 1   CustomerId       15000 non-null  float64
 2   Surname          15000 non-null  object 
 3   CreditScore      15000 non-null  float64
 4   Geography        15000 non-null  object 
 5   Gender           15000 non-null  object 
 6   Age              15000 non-null  float64
 7   Tenure           15000 non-null  float64
 8   Balance          15000 non-null  float64
 9   NumOfProducts    15000 non-null  float64
 10  HasCrCard        15000 non-null  float64
 11  IsActiveMember   15000 non-null  float64
 12  EstimatedSalary  15000 non-null  float64
 13  Exited           15000 non-null  float64
dtypes: float64(10), int64(1), object(3)
memory usage: 1.6+ MB


In [None]:
# Count how often each distinct Balance value occurs (most frequent first)
# and print the resulting Series.
value_counts = df['Balance'].value_counts()
print(value_counts)

Balance
0.00         9815
122314.50      17
122453.97      14
124577.33      13
126473.33      11
             ... 
121375.39       1
104817.37       1
162657.64       1
123356.63       1
131354.43       1
Name: count, Length: 3360, dtype: int64


In [None]:
# Remove the identifier columns (row id, customer id, surname) before modelling.
data = df.drop(["id", "CustomerId", "Surname"], axis="columns")

# Split the remaining frame into the target ("Exited") and the feature matrix.
y = data["Exited"]
X = data.drop("Exited", axis="columns")

In [None]:
# Identify categorical and numerical columns
categorical_columns = ["Geography", "Gender"]

# Treat every numeric dtype as a numerical feature, not only float64.
# Any feature column that ends up in neither list is not passed to a
# transformer by the ColumnTransformer built from these lists, so an
# integer-typed feature would otherwise be left out without any warning.
# The explicit filter keeps the two lists disjoint even if a categorical
# column were stored with a numeric dtype.
numerical_columns = [
    col
    for col in X.select_dtypes(include="number").columns
    if col not in categorical_columns
]

In [None]:
# Preprocessing: standardise the numerical columns and one-hot encode the
# categorical ones (dropping the first level of each categorical feature).
numeric_transformer = ("num", StandardScaler(), numerical_columns)
categorical_transformer = ("cat", OneHotEncoder(drop="first"), categorical_columns)

preprocessor = ColumnTransformer(
    transformers=[numeric_transformer, categorical_transformer]
)

In [None]:
# Hold out 30% of the rows for validation. The split is stratified on the
# target and uses a fixed seed so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Fit the preprocessor on the training split only, then apply the fitted
# transformation unchanged to the validation split.
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

In [None]:
# Candidate classifiers, keyed by the display name used when reporting scores.
# Every estimator uses random_state=42 and class_weight='balanced'.
models = {
    "Logistic Regression": LogisticRegression(
    random_state=42,
    max_iter=1000,       # Maximum number of iterations for solver convergence
    C=1.0,               # Inverse of regularization strength; smaller values specify stronger regularization
    penalty='l2',        # L2 regularization (Ridge), helps prevent overfitting by penalizing large coefficients
    class_weight='balanced',  # Adjusts weights inversely proportional to class frequencies
    intercept_scaling=1, # Only relevant to the 'liblinear' solver; no solver is set here, so this presumably has no effect -- TODO confirm
),
    "Decision Tree": DecisionTreeClassifier(
    random_state=42,
    criterion='entropy',      # Measure the quality of a split using entropy (alternatively 'gini')
    max_depth=10,             # Maximum depth of the tree, limiting it prevents overfitting
    min_samples_split=20,     # Minimum number of samples required to split an internal node
    min_samples_leaf=10,      # Minimum number of samples required to be at a leaf node (prevents too many small leaves)
    max_features='sqrt',      # Consider only sqrt(n_features) features for each split to reduce overfitting
    class_weight='balanced',  # Adjust weights inversely proportional to class frequencies to deal with class imbalance
),
    # Disabled alternative configuration, kept for reference:
    # "Random Forest": RandomForestClassifier(random_state=42, n_estimators=1000),
    "Random Forest": RandomForestClassifier(
    random_state=42,
    n_estimators=5000,  # Number of trees in the forest
    class_weight = 'balanced',  # Adjust weights inversely proportional to class frequencies
    # max_leaf_nodes=100,  # Maximum number of leaf nodes in each tree (disabled)
    max_depth=None,  # Maximum depth of the tree (None means the nodes are expanded until all leaves are pure or too small to split)
    min_samples_split=20,  # Minimum number of samples required to split an internal node
    min_samples_leaf=1,  # Minimum number of samples required to be at a leaf node
    max_features='sqrt',  # Consider sqrt(n_features) features when looking for the best split
    bootstrap=True,  # Whether to use bootstrap sampling
    criterion='entropy',  # The function to measure the quality of a split ('gini' is the alternative)
    max_samples=0.7,  # Fraction of the training rows drawn to train each tree (None means use all samples)
    oob_score=False,  # Whether to use out-of-bag samples to estimate the generalization error
    n_jobs=-1  # The number of jobs to run in parallel (set to -1 to use all available cores)
),
        # NOTE: the evaluation loop swaps this estimator for best_lgb_model
        # (the GridSearchCV result) under the "LightGBM" key, so the settings
        # below, including class_weight, are not the ones that get scored.
        "LightGBM": lgb.LGBMClassifier(random_state=42, class_weight='balanced')

}


Logistic Regression:


*   C -> inverse of the regularization strength; a smaller C applies stronger regularization, which helps prevent overfitting
*   l2 -> penalizes large coefficients, which helps prevent overfitting
*   balanced -> the classes may be imbalanced, so each class is weighted inversely proportional to its frequency






Decision Tree:


*   entropy -> splits are chosen by information gain (the reduction in entropy)
*   max_depth 10 -> caps how deep the tree can grow, trading model complexity against the risk of overfitting
*   min_samples_split 20 -> a node is split only when it contains at least 20 samples
*   min_samples_leaf 10 -> every leaf must keep at least 10 samples, which reduces variance and helps the model generalize
*   sqrt -> each split considers a random subset of sqrt(n_features) features rather than all features, which helps control overfitting





In [None]:
# Tune LightGBM model using GridSearchCV
# The grid has 3 x 3 x 3 = 27 combinations; each is scored by ROC AUC with
# 3-fold cross-validation on the preprocessed training split.
lgb_param_grid = dict(
    num_leaves=[31, 50, 100],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 500, 1000],
)

lgb_grid_search = GridSearchCV(
    estimator=lgb.LGBMClassifier(random_state=42),
    param_grid=lgb_param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    n_jobs=-1,
)

# Run the search, then keep the estimator built from the best-scoring
# parameter combination.
lgb_grid_search.fit(X_train_processed, y_train)
best_lgb_model = lgb_grid_search.best_estimator_

print("Best Parameters:", lgb_grid_search.best_params_)


Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Number of positive: 2111, number of negative: 8389
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000461 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 846
[LightGBM] [Info] Number of data points in the train set: 10500, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.201048 -> initscore=-1.379760
[LightGBM] [Info] Start training from score -1.379760
Best Parameters: {'learning_rate': 0.01, 'n_estimators': 500, 'num_leaves': 31}


In [None]:

# Train and evaluate models
# Each candidate is scored on the held-out validation split. Results are
# collected per model name in two dicts:
#   roc_auc_scores - ROC AUC computed from the predicted probabilities
#   f1_scores      - F1 computed from the predicted class labels
roc_auc_scores = {}
f1_scores = {}

for model_name, model in models.items():
    if model_name == "LightGBM":
        # Score the tuned LightGBM model instead of the untuned dict entry.
        # GridSearchCV refits best_estimator_ on the data passed to its fit()
        # (refit=True is the default), i.e. on X_train_processed / y_train,
        # so fitting it again here would only repeat the same training run.
        model = best_lgb_model
    else:
        # Train the model
        model.fit(X_train_processed, y_train)

    # Predict probabilities for ROC AUC (column 1 = second class)
    y_val_probs = model.predict_proba(X_val_processed)[:, 1]

    # Predict class labels for F1 score
    y_val_preds = model.predict(X_val_processed)

    # Calculate ROC AUC score
    roc_auc_scores[model_name] = roc_auc_score(y_val, y_val_probs)

    # Calculate F1 score
    f1_scores[model_name] = f1_score(y_val, y_val_preds)



[LightGBM] [Info] Number of positive: 2111, number of negative: 8389
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000393 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 846
[LightGBM] [Info] Number of data points in the train set: 10500, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.201048 -> initscore=-1.379760
[LightGBM] [Info] Start training from score -1.379760


In [None]:
# ROC AUC and F1 scores
# Print both score tables, one "<model name>: <score>" line per model with
# four decimal places.
for heading, scores in (
    ("ROC AUC Scores:", roc_auc_scores),
    ("\nF1 Scores:", f1_scores),
):
    print(heading)
    for model_name, score in scores.items():
        print(f"{model_name}: {score:.4f}")
# ROC AUC values from the recorded run, kept for reference:
#   Random Forest 0.9291
#   LightGBM      0.9329

ROC AUC Scores:
Logistic Regression: 0.8780
Decision Tree: 0.8593
Random Forest: 0.9291
LightGBM: 0.9329

F1 Scores:
Logistic Regression: 0.6450
Decision Tree: 0.6159
Random Forest: 0.7291
LightGBM: 0.7257


In [None]:
import pandas as pd
import lightgbm as lgb

# Score the test file with the tuned LightGBM model and write one row per
# test id with its predicted churn probability.
test_file_path = '/content/test.csv'  # Replace with your actual test file path
test_data = pd.read_csv(test_file_path)

# Drop the same identifier columns as for the training data, then apply the
# already-fitted `preprocessor` (transform only, no re-fitting).
X_test = test_data.drop(["id", "CustomerId", "Surname"], axis="columns")
X_test_processed = preprocessor.transform(X_test)

# best_lgb_model is the estimator selected by the grid search; column 1 of
# predict_proba is used as the churn probability.
y_test_probs = best_lgb_model.predict_proba(X_test_processed)[:, 1]

# Pair every test id with its predicted probability.
predictions = test_data[["id"]].assign(churn_probability=y_test_probs)

# Write the predictions without the DataFrame index.
output_file_path = 'lightgbm_predictions.csv'  # Replace with your desired output path
predictions.to_csv(output_file_path, index=False)

print(f"Predictions saved to: {output_file_path}")


Predictions saved to: lightgbm_predictions.csv


In [None]:
# Disabled alternative to the LightGBM export: the same test-set scoring
# steps, but using the "Random Forest" entry of `models` and writing to
# 'random_forest_predictions.csv'. Everything below is commented out, so
# this cell does nothing when run.

# import pandas as pd

# # Load the test dataset
# test_file_path = '/content/test.csv'  # Replace with your actual test file path
# test_data = pd.read_csv(test_file_path)

# # Preprocess the test data
# X_test = test_data.drop(columns=["id", "CustomerId", "Surname"])
# X_test_processed = preprocessor.transform(X_test)

# # Use the trained Random Forest model to predict churn probabilities
# random_forest = models["Random Forest"]
# y_test_probs = random_forest.predict_proba(X_test_processed)[:, 1]

# # Create a DataFrame with the predictions
# predictions = pd.DataFrame({
#     "id": test_data["id"],
#     "churn_probability": y_test_probs
# })

# # Save the predictions to a CSV file
# output_file_path = 'random_forest_predictions.csv'  # Replace with your desired output path
# predictions.to_csv(output_file_path, index=False)

# print(f"Predictions saved to: {output_file_path}")
