# Installing Libraries

In [None]:
pip install pandas scikit-learn tpot

# Importing Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from tpot import TPOTClassifier

# Data Processing

In [None]:
# Path of the churn dataset CSV.
# NOTE(review): the /content/drive prefix looks like a mounted Google Drive — adjust for your environment.
url = "/content/drive/MyDrive/Auto AI/data.csv"

# Read the CSV into the working DataFrame used by the rest of the notebook.
df = pd.read_csv(url)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes




The data set includes information about:


*   Customers who left within the last month – the column is called Churn.

*   Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies.

*   Customer account information - how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges.

*   Demographic info about customers – gender, age range, and whether they have partners and dependents.






In [None]:
# (rows, columns) of the loaded dataset.
df.shape

(7043, 21)

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN (errors='coerce').
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Count the missing values in every column to see what the coercion produced.
df.isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [None]:
# Drop the customer identifier column; it is not used as a model feature.
df = df.drop('customerID', axis=1)

# Remove customers whose tenure is 0.
df = df.drop(labels=df[df['tenure'] == 0].index, axis=0)

# Impute any remaining missing TotalCharges with the column mean.
# fillna returns a new object, so the result must be assigned back; the original
# call discarded it and would also have filled *every* column with this mean.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())

# Display the cleaned frame.
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7039,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7040,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


In [None]:
# Recode the 0/1 SeniorCitizen flag as "No"/"Yes" text labels.
# NOTE: Series.map turns values that are not keys of the dict into NaN, so this cell
# is not safe to run twice on the same frame (already-mapped labels would become NaN).
df["SeniorCitizen"]= df["SeniorCitizen"].map({0: "No", 1: "Yes"})
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


**Converting object (text) columns to integer codes**

In [None]:
from sklearn.preprocessing import LabelEncoder

def object_to_int(dataframe_series):
    """Label-encode a column when its dtype is ``object``.

    Parameters
    ----------
    dataframe_series : pandas.Series
        A single DataFrame column.

    Returns
    -------
    The input itself, untouched, when its dtype is not ``object``; otherwise the
    integer codes produced by ``LabelEncoder().fit_transform`` on the column.
    """
    # Non-object columns (numeric, etc.) pass straight through.
    if dataframe_series.dtype != 'object':
        return dataframe_series

    # A fresh encoder per column: codes are only meaningful within that column.
    return LabelEncoder().fit_transform(dataframe_series)

In [None]:
# Run the encoder over every column; only object-typed columns are changed.
df = df.apply(object_to_int)
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.5,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1840.75,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65,1


**Splitting the data into X (features) and y (label), then creating the train and test sets**

In [None]:
# Label vector: the Churn column as a NumPy array.
y = df['Churn'].values

# Feature matrix: every column except the label.
X = df.drop('Churn', axis=1)

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed so the split is repeatable.
# Stratifying on y is useful for imbalanced datasets: it keeps the proportion of
# class labels in the train and test sets the same as in the input dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=40,
    stratify=y,
)

# Feature Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier

def feature_selection(X_train, y_train, random_state=42):
    """Select features with a random-forest-based ``SelectFromModel``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must expose ``.columns`` so the kept names can be reported.
    y_train : array-like
        Training labels.
    random_state : int or None, default 42
        Seed for the underlying ``RandomForestClassifier``. Forest training is
        randomized, so without a seed the set of selected features can change
        from run to run. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    X_train_selected
        ``selector.transform(X_train)``, i.e. the training data restricted to the
        selected features.
    selected_feature_names : pandas.Index
        Names of the columns that were kept, in their original column order.
    """
    # Fit the forest and let SelectFromModel apply its default importance threshold.
    selector = SelectFromModel(RandomForestClassifier(random_state=random_state))
    selector.fit(X_train, y_train)

    # Boolean mask over the input columns: True where the feature was kept.
    selected_features_mask = selector.get_support()
    selected_feature_names = X_train.columns[selected_features_mask]

    # Reduce the training data to the selected features only.
    X_train_selected = selector.transform(X_train)

    return X_train_selected, selected_feature_names

In [None]:
# Fit the selector on the training split only and report which columns were kept.
X_train_selected,selected_feature_names=feature_selection(X_train,y_train)
print("Selected Feature Names:", selected_feature_names)

Selected Feature Names: Index(['tenure', 'Contract', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges'],
      dtype='object')


# Model Selection and Tuning

In [None]:
def model_selection_and_tuning(
    X_train,
    y_train,
    generations=5,
    population_size=20,
    cv=5,
    scoring='accuracy',
    random_state=42,
    verbosity=2,
):
    """Search for a model pipeline with TPOT and return the best fitted one.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``TPOTClassifier.fit``.
    generations, population_size, cv, scoring, random_state, verbosity
        Forwarded unchanged to ``TPOTClassifier``. The defaults reproduce the
        values that were previously hard-coded, so existing two-argument calls
        behave as before.

    Returns
    -------
    The ``fitted_pipeline_`` attribute of the fitted ``TPOTClassifier``.
    """
    tpot = TPOTClassifier(
        verbosity=verbosity,
        generations=generations,
        population_size=population_size,
        random_state=random_state,
        scoring=scoring,
        cv=cv,
    )
    tpot.fit(X_train, y_train)
    return tpot.fitted_pipeline_

In [None]:
# Run the TPOT search on the feature-selected training data (this is the slow cell).
best_model = model_selection_and_tuning(X_train_selected, y_train)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7956109941810079

Generation 2 - Current best internal CV score: 0.797033552061409

Generation 3 - Current best internal CV score: 0.7974400561264496

Generation 4 - Current best internal CV score: 0.7974400561264496

Generation 5 - Current best internal CV score: 0.7974400561264496

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.9500000000000001, min_samples_leaf=14, min_samples_split=18, n_estimators=100)


In [None]:
# Persist the best pipeline with joblib so it can be reused without repeating the search.
import joblib

# Destination file for the serialized model (here: a folder on my Drive).
save_model_path = "/content/drive/MyDrive/Auto AI/best_model.joblib"

joblib.dump(best_model, save_model_path)
print(f"Best model saved to {save_model_path}")

Best model saved to /content/drive/MyDrive/Auto AI/best_model.joblib


# Evaluating the model

In [None]:
# Reload the pipeline from disk to check that the saved file can be read back.
# joblib.load unpickles the file, so only load model files you trust.
loaded_model = joblib.load(save_model_path)

In [None]:
# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """Return the accuracy of ``model`` on a labelled evaluation set.

    Parameters
    ----------
    model
        Any fitted object exposing ``predict``.
    X_test
        Features to predict on.
    y_test
        True labels for ``X_test``.

    Returns
    -------
    The value of ``accuracy_score(y_test, model.predict(X_test))``.
    """
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)

In [None]:
# Feature selection was applied to X_train, so any data used for evaluation must be
# reduced to the same columns before it is passed to the model.
def filter_features_for_test_data(test_data, feature_names=None):
    """Return ``test_data`` restricted to the selected feature columns.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Data holding at least the selected feature columns.
    feature_names : list-like of column labels, optional
        Columns to keep, in the desired order. When omitted, the notebook-level
        ``selected_feature_names`` produced by the feature-selection step is used,
        which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        A new frame with only the requested columns; ``test_data`` is not modified.

    Raises
    ------
    KeyError
        Raised by pandas when a requested column is missing from ``test_data``.
    """
    if feature_names is None:
        # Fall back to the global set by the feature-selection cell.
        feature_names = selected_feature_names
    return test_data[feature_names]

# Restrict the held-out split to the selected feature columns and display the result.
test_data=filter_features_for_test_data(X_test)
test_data

Unnamed: 0,tenure,Contract,PaymentMethod,MonthlyCharges,TotalCharges
5710,70,2,2,110.50,7752.05
2513,52,2,3,19.20,1054.75
1078,59,2,1,79.20,4590.35
5711,20,0,2,90.80,1951.00
6574,9,0,2,90.10,816.80
...,...,...,...,...,...
2115,71,2,2,118.65,8477.60
1290,72,2,1,84.45,5899.85
2095,1,0,2,79.95,79.95
2139,7,0,2,66.85,458.10


In [None]:
# Score the pipeline that was reloaded from disk, so the saved artifact itself
# (not just the in-memory object) is checked against the held-out data.
accuracy = evaluate_model(loaded_model, test_data, y_test)
print(f"Final Accuracy: {accuracy}")

Final Accuracy: 0.8028436018957346




**As we can see, the model's accuracy on the held-out test set is about 80%.**