<h1>Bank Customer Churn Analysis</h1>

<h4>Importing the dataset</h4>

In [None]:
import pandas as pd

# Read the customer churn records; the path is resolved relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/Customer-Churn-Records.csv')

# Preview the first five rows of the freshly loaded frame
df.head(n=5)

<h6>Checking for missing values & preparing dataset</h6>

In [None]:
# Count the missing entries in every column (the result is the cell's output)
df.isna().sum()

#df.describe()   # summary statistics

In [43]:
# Discard every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

## Alternative: impute instead of dropping, filling numeric gaps with the column mean
#df = df.fillna(df.mean(numeric_only=True))

<h5>Removing unnecessary columns and scaling numerical features</h5>

In [None]:
from sklearn.preprocessing import StandardScaler

# Drop identifier columns that are not used as features.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

# Select the numerical columns, leaving out the target 'Exited'.
# The target must NOT be standardised: scaling turns the class labels into
# arbitrary floats, and scikit-learn classifiers then reject `y` as a
# "continuous" target when the models are fitted further down.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
num_cols = numeric_cols[numeric_cols != 'Exited']

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform only the numerical feature columns.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the features; consider fitting it on
# the training split only.
df[num_cols] = scaler.fit_transform(df[num_cols])

# Display the first few rows after scaling
print(df.head())


<h4>Split the dataset</h4>

In [None]:
# Split the dataset into features/target and then into train/test partitions

from sklearn.model_selection import train_test_split

# Define Features (X) and Target (y)
# The earlier scaling step only touched numerical columns, so any remaining
# non-numeric columns are one-hot encoded here; the estimators fitted below
# cannot consume raw strings. If every column is already numeric,
# get_dummies leaves the frame unchanged. dtype=float keeps the indicator
# columns numeric rather than boolean.
X = pd.get_dummies(df.drop('Exited', axis=1), drop_first=True, dtype=float)
y = df['Exited']        # target column

# Split the dataset into training and testing sets (80% training, 20% testing);
# random_state pins the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)


<h4>Train Classification Using Different Models</h4>

In [None]:
# logistic regression
from sklearn.linear_model import LogisticRegression
# decision tree
from sklearn.tree import DecisionTreeClassifier
# random forest
from sklearn.ensemble import RandomForestClassifier
# support vector machine
from sklearn.svm import SVC
# evaluation metric
from sklearn.metrics import accuracy_score


def _fit_and_score(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` on the training split and evaluate it on both splits.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    x_train, y_train : training features and labels
    x_test, y_test : held-out features and labels

    Returns
    -------
    tuple
        ``(train_pred, test_pred, train_acc, test_acc)`` - the predictions on
        each split followed by their ``accuracy_score`` against the true labels.
    """
    model.fit(x_train, y_train)
    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)
    return (
        train_pred,
        test_pred,
        accuracy_score(y_train, train_pred),
        accuracy_score(y_test, test_pred),
    )


# Initialize the models.
# The tree-based estimators randomise their fitting procedure; pinning
# random_state makes the reported accuracies reproducible across runs.
log_reg = LogisticRegression()
dec_tree = DecisionTreeClassifier(random_state=0)
ran_for = RandomForestClassifier(random_state=0)
svm = SVC()

# Train each model, predict on both splits and compute the accuracies.
# The per-model variable names are kept so other cells can still refer to them.
(y_train_pred_log_reg, y_test_pred_log_reg,
 accuracy_log_reg_train, accuracy_log_reg_test) = _fit_and_score(
    log_reg, x_train, y_train, x_test, y_test)

(y_train_pred_dec_tree, y_test_pred_dec_tree,
 accuracy_dec_tree_train, accuracy_dec_tree_test) = _fit_and_score(
    dec_tree, x_train, y_train, x_test, y_test)

(y_train_pred_ran_for, y_test_pred_ran_for,
 accuracy_ran_for_train, accuracy_ran_for_test) = _fit_and_score(
    ran_for, x_train, y_train, x_test, y_test)

(y_train_pred_svm, y_test_pred_svm,
 accuracy_svm_train, accuracy_svm_test) = _fit_and_score(
    svm, x_train, y_train, x_test, y_test)

# Print the accuracies, one section per model
for model_label, train_acc, test_acc in [
    ('Logistic Regression', accuracy_log_reg_train, accuracy_log_reg_test),
    ('Decision Tree', accuracy_dec_tree_train, accuracy_dec_tree_test),
    ('Random Forest', accuracy_ran_for_train, accuracy_ran_for_test),
    ('Support Vector Machine', accuracy_svm_train, accuracy_svm_test),
]:
    print(model_label)
    print('Training accuracy: ', train_acc)
    print('Testing accuracy: ', test_acc)
    print('\n')

