In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score


df = pd.read_csv("C:\\Users\\DELL\\Downloads\\WA_Fn-UseC_-Telco-Customer-Churn.csv")



In [2]:

df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(0, inplace=True)

# Convert 'Churn' to binary
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})


In [2]:
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/24/ec/ad387100fa3cc2b9b81af0829b5ecfe75ec5bb19dd7c19d4fea06fb81802/xgboost-2.0.3-py3-none-win_amd64.whl.metadata
  Using cached xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Using cached xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3


In [4]:
!pip install lightgbm

Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/e1/4c/4685ccfae9806f561de716e32549190c1f533dde5bcadaf83bdf23972cf0/lightgbm-4.3.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.3.0-py3-none-win_amd64.whl.metadata (19 kB)
Downloading lightgbm-4.3.0-py3-none-win_amd64.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
    --------------------------------------- 0.0/1.3 MB 1.4 MB/s eta 0:00:01
   -- ------------------------------------- 0.1/1.3 MB 1.2 MB/s eta 0:00:02
   --- ------------------------------------ 0.1/1.3 MB 819.2 kB/s eta 0:00:02
   ---- ----------------------------------- 0.2/1.3 MB 1.1 MB/s eta 0:00:02
   ----- ---------------------------------- 0.2/1.3 MB 876.1 kB/s eta 0:00:02
   ----- ---------------------------------- 0.2/1.3 MB 787.7 kB/s eta 0:00:02
   ------ --------------------------------- 0.2/1.3 MB 692.4 kB/s eta 0:00:02
   ------ --------------------------

In [3]:

X = df.drop('Churn', axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


categorical_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                        'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [4]:

numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

In [5]:

X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [6]:

numerical_columns = preprocessor.transformers_[0][1]['scaler'].get_feature_names_out(numerical_features)
categorical_columns = preprocessor.transformers_[1][1]['onehot'].get_feature_names_out(categorical_features)

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=list(numerical_columns) + list(categorical_columns))
X_test_preprocessed = pd.DataFrame(X_test_preprocessed, columns=list(numerical_columns) + list(categorical_columns))


In [7]:
#Q14

numerical_columns = preprocessor.transformers_[0][1]['scaler'].get_feature_names_out(numerical_features)
categorical_columns = preprocessor.transformers_[1][1]['onehot'].get_feature_names_out(categorical_features)

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=list(numerical_columns) + list(categorical_columns))
X_test_preprocessed = pd.DataFrame(X_test_preprocessed, columns=list(numerical_columns) + list(categorical_columns))


rf_model = RandomForestClassifier(random_state=1)
rf_model.fit(X_train_preprocessed, y_train)


y_pred = rf_model.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test, y_pred)

print(f'Random Forest Accuracy: {accuracy:.4f}')

Random Forest Accuracy: 0.7913


In [8]:
#Q15

xgb_model = XGBClassifier(random_state=1, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_preprocessed, y_train)


y_pred = xgb_model.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test, y_pred)

print(f'XGBoost Accuracy: {accuracy:.4f}')

XGBoost Accuracy: 0.7935


In [9]:
#Q16

lgbm_model = LGBMClassifier(random_state=1)
lgbm_model.fit(X_train_preprocessed, y_train)


y_pred = lgbm_model.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test, y_pred)

print(f'LightGBM Accuracy: {accuracy:.4f}')

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001946 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
LightGBM Accuracy: 0.8034


In [11]:
#Q17
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score


n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

# Initializing the Extra Trees Classifier
etc = ExtraTreesClassifier(random_state=1)

# Initializing RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=etc,
                                   param_distributions=hyperparameter_grid,
                                   n_iter=10,
                                   scoring='accuracy',
                                   cv=5,
                                   n_jobs=-1,
                                   verbose=1,
                                   random_state=1)


random_search.fit(X_train_preprocessed, y_train)


print(f"Best hyperparameters: {random_search.best_params_}")


best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {accuracy:.4f}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\DELL\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\DELL\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\DELL\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\DELL\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterErro

Best hyperparameters: {'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}
Test set accuracy: 0.8041


In [13]:
#Q18
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV


param_dist = {
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'random_state': [1]
}

# Initializing the ExtraTreesClassifier
initial_model = ExtraTreesClassifier(random_state=1)


random_search = RandomizedSearchCV(estimator=initial_model, param_distributions=param_dist, n_iter=50, cv=5, random_state=1, n_jobs=-1)
random_search.fit(X_train_preprocessed, y_train)


best_params = random_search.best_params_
print("Best hyperparameters from RandomizedSearchCV:", best_params)


best_model = ExtraTreesClassifier(**best_params)
best_model.fit(X_train_preprocessed, y_train)

# Predict on the test set
y_pred_best = best_model.predict(X_test_preprocessed)

# Evaluate accuracy of the new model
accuracy_best = accuracy_score(y_test, y_pred_best)
print("Accuracy of the new optimal ExtraTreesClassifier model:", accuracy_best)

# Compare with the initial model (without hyperparameter tuning)
initial_model.fit(X_train_preprocessed, y_train)
y_pred_initial = initial_model.predict(X_test_preprocessed)
accuracy_initial = accuracy_score(y_test, y_pred_initial)
print("Accuracy of the initial ExtraTreesClassifier model:", accuracy_initial)


Best hyperparameters from RandomizedSearchCV: {'random_state': 1, 'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': None, 'max_depth': 30, 'criterion': 'entropy', 'bootstrap': True}
Accuracy of the new optimal ExtraTreesClassifier model: 0.8112136266855926
Accuracy of the initial ExtraTreesClassifier model: 0.7672107877927609
