<a href="https://colab.research.google.com/github/Sans7349/CODESOFT/blob/My-tasks/TASK_3_CUSTOMER_CHURN_PREDICTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset from the CSV file into a DataFrame
file_path = '/content/Churn_Modelling.csv'
data = pd.read_csv(file_path)

# Preview: head() with no argument returns the first five rows
print(f"First few rows of the dataset:\n {data.head()}")

# Schema overview: info() writes its report directly, so only the heading
# goes through print()
print("\nDataset information:\n")
data.info()

# Count of null entries per column
print(f"\nMissing values:\n {data.isnull().sum()}")

# Summary statistics as produced by describe()
print(f"\nBasic statistics:\n {data.describe()}")

print("___________________________________________________________________________________________________________________________________________________")


# Data preprocessing
# Drop irrelevant columns (not used as model features)
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

# Encode categorical variables as integer codes.
# NOTE(review): LabelEncoder assigns codes by sorted class order, which gives
# 'Geography' an arbitrary numeric ordering that a linear model will treat as
# meaningful. One-hot encoding would avoid this, but the new-customer example
# later in this cell builds its input from these integer mappings, so the
# encoding is kept as-is here.
label_encoder_geography = LabelEncoder()
label_encoder_gender = LabelEncoder()

data['Geography'] = label_encoder_geography.fit_transform(data['Geography'])
data['Gender'] = label_encoder_gender.fit_transform(data['Gender'])

# Define features and target
X = data.drop(columns='Exited')
y = data['Exited']

# Split the data into training (80%) and testing (20%) sets.
# stratify=y keeps the proportion of each 'Exited' class the same in both
# splits, so the test metrics are not skewed by an unlucky class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split, so no test statistics leak
# into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model building
# Every estimator is fitted on the scaled training split and then asked for
# label predictions on the scaled test split. fit() returns the estimator
# itself, so fitting and predicting are chained in one expression.

# Logistic Regression
log_reg = LogisticRegression(random_state=42)
y_pred_log_reg = log_reg.fit(X_train, y_train).predict(X_test)

# Random Forest
rand_forest = RandomForestClassifier(random_state=42)
y_pred_rand_forest = rand_forest.fit(X_train, y_train).predict(X_test)

# Gradient Boosting
grad_boost = GradientBoostingClassifier(random_state=42)
y_pred_grad_boost = grad_boost.fit(X_train, y_train).predict(X_test)

# Model evaluation
def evaluate_model(y_test, y_pred):
    """Print accuracy, confusion matrix and classification report.

    Args:
        y_test: True labels to compare against.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. The three metrics are written to standard output.
    """
    accuracy = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", matrix)
    print("Classification Report:\n", report)

# Report test-split metrics for the logistic regression predictions.
print("\nLogistic Regression:")
evaluate_model(y_test, y_pred_log_reg)
print("___________________________________________________________________________________________________________________________________________________")


# Report test-split metrics for the random forest predictions.
print("\nRandom Forest:")
evaluate_model(y_test, y_pred_rand_forest)
print("___________________________________________________________________________________________________________________________________________________")


# Report test-split metrics for the gradient boosting predictions.
print("\nGradient Boosting:")
evaluate_model(y_test, y_pred_grad_boost)
print("___________________________________________________________________________________________________________________________________________________")


# Gradient Boosting is assumed (not computed) to be the best performer and is
# the model used for the example prediction below.
best_model = grad_boost

# Save the encoding mappings: original category label -> integer code produced
# by the fitted encoders
geography_mapping = {
    label: code
    for label, code in zip(
        label_encoder_geography.classes_,
        label_encoder_geography.transform(label_encoder_geography.classes_),
    )
}
gender_mapping = {
    label: code
    for label, code in zip(
        label_encoder_gender.classes_,
        label_encoder_gender.transform(label_encoder_gender.classes_),
    )
}

# Predict churn for new data (example): a single-row frame whose columns are
# listed in the same order as the training features
new_customer_data = pd.DataFrame([{
    'CreditScore': 600,
    'Geography': geography_mapping['France'],
    'Gender': gender_mapping['Female'],
    'Age': 40,
    'Tenure': 5,
    'Balance': 60000,
    'NumOfProducts': 2,
    'HasCrCard': 1,
    'IsActiveMember': 1,
    'EstimatedSalary': 50000,
}])

# Apply the scaler fitted on the training split; the name is rebound from the
# DataFrame to the scaled array returned by transform()
new_customer_data = scaler.transform(new_customer_data)

# Predict churn: label 1 is reported as "Yes", anything else as "No"
churn_prediction = best_model.predict(new_customer_data)
print(f"\nChurn prediction for the new customer: {'Yes' if churn_prediction[0] == 1 else 'No'}")


First few rows of the dataset:
    RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         