In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np

In [7]:
df = pd.read_csv('Task 02/merged_raw_dataset.csv')

In [8]:


def clean_and_process_data(df):
    """
    Cleans and processes merged raw data for machine learning.

    Parameters:
        df (pd.DataFrame): Raw merged data.

    Returns:
        pd.DataFrame: Cleaned and processed data ready for modeling.
    """

    df = df.copy()

    # Drop duplicate rows
    df = df.drop_duplicates()

    # Handle missing values (numerical and categorical separately)
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    cat_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns

    imputer_num = SimpleImputer(strategy='mean')
    imputer_cat = SimpleImputer(strategy='most_frequent')

    df[num_cols] = imputer_num.fit_transform(df[num_cols])
    df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

    # Encode categorical features
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    encoded_cat = encoder.fit_transform(df[cat_cols])
    encoded_cat_df = pd.DataFrame(encoded_cat, columns=encoder.get_feature_names_out(cat_cols), index=df.index)

    df = df.drop(columns=cat_cols)
    df = pd.concat([df, encoded_cat_df], axis=1)

    # Feature scaling for numerical features
    scaler = StandardScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])

    return df


In [9]:
df

Unnamed: 0,CustomerID,Age,Gender,MaritalStatus,IncomeLevel,TransactionID,TransactionDate,AmountSpent,ProductCategory,InteractionID,InteractionDate,InteractionType,ResolutionStatus,LastLoginDate,LoginFrequency,ServiceUsage,ChurnStatus
0,1,62,M,Single,Low,7194,2022-03-27,416.50,Electronics,6363.0,2022-03-31,Inquiry,Resolved,2023-10-21,34,Mobile App,0
1,2,65,M,Married,Low,7250,2022-08-08,54.96,Clothing,3329.0,2022-03-17,Inquiry,Resolved,2023-12-05,5,Website,1
2,2,65,M,Married,Low,9660,2022-07-25,197.50,Electronics,3329.0,2022-03-17,Inquiry,Resolved,2023-12-05,5,Website,1
3,2,65,M,Married,Low,2998,2022-01-25,101.31,Furniture,3329.0,2022-03-17,Inquiry,Resolved,2023-12-05,5,Website,1
4,2,65,M,Married,Low,1228,2022-07-24,397.37,Clothing,3329.0,2022-03-17,Inquiry,Resolved,2023-12-05,5,Website,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6807,1000,34,M,Widowed,Low,2724,2022-09-08,232.06,Groceries,,,,,2023-08-13,22,Mobile App,0
6808,1000,34,M,Widowed,Low,2917,2022-12-13,324.98,Books,,,,,2023-08-13,22,Mobile App,0
6809,1000,34,M,Widowed,Low,2979,2022-06-15,375.34,Groceries,,,,,2023-08-13,22,Mobile App,0
6810,1000,34,M,Widowed,Low,8594,2022-04-08,166.73,Books,,,,,2023-08-13,22,Mobile App,0


In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Step 1: Load and inspect data
df1= pd.read_csv('Task 02/merged_raw_dataset.csv')


In [11]:
df1

Unnamed: 0,CustomerID,Age,Gender,MaritalStatus,IncomeLevel,TransactionID,TransactionDate,AmountSpent,ProductCategory,InteractionID,InteractionDate,InteractionType,ResolutionStatus,LastLoginDate,LoginFrequency,ServiceUsage,ChurnStatus
0,1,62,M,Single,Low,7194,2022-03-27,416.50,Electronics,6363.0,2022-03-31,Inquiry,Resolved,2023-10-21,34,Mobile App,0
1,2,65,M,Married,Low,7250,2022-08-08,54.96,Clothing,3329.0,2022-03-17,Inquiry,Resolved,2023-12-05,5,Website,1
2,2,65,M,Married,Low,9660,2022-07-25,197.50,Electronics,3329.0,2022-03-17,Inquiry,Resolved,2023-12-05,5,Website,1
3,2,65,M,Married,Low,2998,2022-01-25,101.31,Furniture,3329.0,2022-03-17,Inquiry,Resolved,2023-12-05,5,Website,1
4,2,65,M,Married,Low,1228,2022-07-24,397.37,Clothing,3329.0,2022-03-17,Inquiry,Resolved,2023-12-05,5,Website,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6807,1000,34,M,Widowed,Low,2724,2022-09-08,232.06,Groceries,,,,,2023-08-13,22,Mobile App,0
6808,1000,34,M,Widowed,Low,2917,2022-12-13,324.98,Books,,,,,2023-08-13,22,Mobile App,0
6809,1000,34,M,Widowed,Low,2979,2022-06-15,375.34,Groceries,,,,,2023-08-13,22,Mobile App,0
6810,1000,34,M,Widowed,Low,8594,2022-04-08,166.73,Books,,,,,2023-08-13,22,Mobile App,0


In [12]:
# Step 2: Remove duplicates
df1 = df1.drop_duplicates()


In [13]:
# Step 3: Handle missing values
df1.fillna({'Gender': 'Unknown', 'MaritalStatus': 'Unknown', 'IncomeLevel': 'Unknown'}, inplace=True)
df1['AmountSpent'].fillna(0, inplace=True)
# Convert dates
for date_col in ['TransactionDate', 'InteractionDate', 'LastLoginDate']:
    df1[date_col] = pd.to_datetime(df1[date_col], errors='coerce')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['AmountSpent'].fillna(0, inplace=True)


In [14]:
print(df1.columns)

Index(['CustomerID', 'Age', 'Gender', 'MaritalStatus', 'IncomeLevel',
       'TransactionID', 'TransactionDate', 'AmountSpent', 'ProductCategory',
       'InteractionID', 'InteractionDate', 'InteractionType',
       'ResolutionStatus', 'LastLoginDate', 'LoginFrequency', 'ServiceUsage',
       'ChurnStatus'],
      dtype='object')


In [16]:
# Step 4: Encode categoricals
df1 = pd.get_dummies(df1, columns=['Gender', 'MaritalStatus', 'IncomeLevel', 'ProductCategory', 'InteractionType', 'ResolutionStatus', 'ServiceUsage'])


In [20]:
# Step 5: Aggregate per customer (example)
customer_features = df1.groupby('CustomerID').agg({
    'Age': 'first',
    'LoginFrequency': 'mean',  # example
    'AmountSpent': ['sum', 'mean', 'count'],
    'TransactionDate': 'max',
    # add more aggregations as needed
}).reset_index()
customer_features.columns = [' '.join(col).strip() for col in customer_features.columns.values]  # Flatten cols

# Merge with churn label (assuming it's available in your data)
labeled_df = customer_features.merge(df1[['CustomerID', 'ChurnStatus']].drop_duplicates(), on='CustomerID', how='left')

# Step 6: Scale numerical features
scaler = StandardScaler()
num_cols = ['Agefirst', 'LoginFrequency mean', 'AmountSpent sum', 'AmountSpent mean']  # etc.
labeled_df[num_cols] = scaler.fit_transform(labeled_df[num_cols])


KeyError: "['Agefirst'] not in index"

In [18]:
# Step 7: Save or proceed to modeling
labeled_df.to_csv('data_cleaned_processed_for_ml.csv', index=False)


In [19]:
labeled_df

Unnamed: 0,CustomerID,Age first,LoginFrequency mean,AmountSpent sum,AmountSpent mean,AmountSpent count,TransactionDate max,ChurnStatus
0,1,62,34.0,416.50,416.500000,1,2022-03-27,0
1,2,65,5.0,1547.42,221.060000,7,2022-11-19,1
2,3,18,3.0,1702.98,283.830000,6,2022-10-08,0
3,4,21,2.0,1834.58,183.458000,10,2022-12-27,0
4,5,21,41.0,2001.49,250.186250,8,2022-12-21,0
...,...,...,...,...,...,...,...,...
995,996,54,38.0,227.25,227.250000,1,2022-07-24,0
996,997,19,5.0,419.82,209.910000,2,2022-10-25,0
997,998,47,47.0,252.15,252.150000,1,2022-09-18,0
998,999,23,23.0,2393.26,265.917778,9,2022-12-07,0


In [27]:

df = pd.read_csv("data_cleaned_processed_for_ml.csv")
print(df.shape)
print(df.head())


(1000, 8)
   CustomerID  Age first  LoginFrequency mean  AmountSpent sum  \
0           1         62                 34.0           416.50   
1           2         65                  5.0          1547.42   
2           3         18                  3.0          1702.98   
3           4         21                  2.0          1834.58   
4           5         21                 41.0          2001.49   

   AmountSpent mean  AmountSpent count TransactionDate max  ChurnStatus  
0         416.50000                  1          2022-03-27            0  
1         221.06000                  7          2022-11-19            1  
2         283.83000                  6          2022-10-08            0  
3         183.45800                 10          2022-12-27            0  
4         250.18625                  8          2022-12-21            0  


In [28]:
from sklearn.model_selection import train_test_split

X = df.drop("ChurnStatus", axis=1)  # replace "Churn" with your actual target column
y = df["ChurnStatus"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
df

In [None]:

# ----- Load data -----
df = pd.read_csv("data_cleaned_processed_for_ml.csv")
X = df.drop("ChurnStatus", axis=1)   # Change 'target' to your actual target column
y = df["ChurnStatus"]

# ----- Convert dates to ordinal -----
for col in X.columns:
    if np.issubdtype(X[col].dtype, np.datetime64):
        X[col] = pd.to_datetime(X[col]).map(pd.Timestamp.toordinal)
    elif X[col].dtype == object:
        try:
            X[col] = pd.to_datetime(X[col]).map(pd.Timestamp.toordinal)
        except:
            pass

# ----- Split train/test -----
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ----- Identify categorical/numeric -----
categorical_features = X_train.select_dtypes(include=['object']).columns
numeric_features = X_train.select_dtypes(exclude=['object']).columns

# ----- Preprocessing -----
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features)
    ]
)

# ----- Random Forest Random Search -----
rf = RandomForestClassifier(random_state=42)
rf_param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf_search = RandomizedSearchCV(
    rf, rf_param_dist, n_iter=20, cv=5, scoring='f1', random_state=42, n_jobs=-1
)
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', rf_search)])
rf_pipeline.fit(X_train, y_train)

# ----- Logistic Regression Random Search -----
lr = LogisticRegression(max_iter=1000, solver='liblinear')
lr_param_dist = {
    'C': np.logspace(-4, 4, 20),
    'penalty': ['l1', 'l2']
}
lr_search = RandomizedSearchCV(
    lr, lr_param_dist, n_iter=20, cv=5, scoring='f1', random_state=42, n_jobs=-1
)
lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', lr_search)])
lr_pipeline.fit(X_train, y_train)

# ----- Evaluate -----
rf_pred = rf_pipeline.predict(X_test)
lr_pred = lr_pipeline.predict(X_test)

print("Random Forest Classification Report:")
print(classification_report(y_test, rf_pred))

print("Logistic Regression Classification Report:")
print(classification_report(y_test, lr_pred))


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86       150
           1       0.00      0.00      0.00        50

    accuracy                           0.75       200
   macro avg       0.38      0.50      0.43       200
weighted avg       0.56      0.75      0.64       200

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86       150
           1       0.00      0.00      0.00        50

    accuracy                           0.75       200
   macro avg       0.38      0.50      0.43       200
weighted avg       0.56      0.75      0.64       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skopt import BayeSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# -------------------
# 1. Preprocessing
# -------------------

# Example: convert date columns to numeric
for col in X_train.columns:
    if np.issubdtype(X_train[col].dtype, np.datetime64):
        X_train[col] = X_train[col].view('int64') // 10**9
    elif X_train[col].dtype == 'object':
        try:
            X_train[col] = pd.to_datetime(X_train[col]).view('int64') // 10**9
        except:
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col].astype(str))

# Repeat for X_test
for col in X_test.columns:
    if np.issubdtype(X_test[col].dtype, np.datetime64):
        X_test[col] = X_test[col].view('int64') // 10**9
    elif X_test[col].dtype == 'object':
        try:
            X_test[col] = pd.to_datetime(X_test[col]).view('int64') // 10**9
        except:
            le = LabelEncoder()
            X_test[col] = le.fit_transform(X_test[col].astype(str))

# -------------------
# 2. Bayesian Search - Random Forest
# -------------------
rf = RandomForestClassifier(random_state=42)

rf_search = BayesSearchCV(
    rf,
    {
        'n_estimators': Integer(100, 300),
        'max_depth': Integer(5, 30),
        'min_samples_split': Integer(2, 10),
        'min_samples_leaf': Integer(1, 5)
    },
    n_iter=20,
    cv=5,
    scoring='f1',
    random_state=42,
    n_jobs=-1
)

rf_search.fit(X_train, y_train)
rf_best = rf_search.best_estimator_

# -------------------
# 3. Bayesian Search - Logistic Regression
# -------------------
log_reg = LogisticRegression(max_iter=1000, random_state=42)

log_reg_search = BayesSearchCV(
    log_reg,
    {
        'C': Real(1e-3, 10, prior='log-uniform'),
        'penalty': Categorical(['l1', 'l2']),
        'solver': Categorical(['liblinear', 'saga'])
    },
    n_iter=20,
    cv=5,
    scoring='f1',
    random_state=42,
    n_jobs=-1
)

log_reg_search.fit(X_train, y_train)
log_reg_best = log_reg_search.best_estimator_

# -------------------
# 4. Evaluation
# -------------------
models = {
    "Random Forest": rf_best,
    "Logistic Regression": log_reg_best
}

results = []
for name, model in models.items():
    y_pred = model.predict(X_test)
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    })

metrics_df = pd.DataFrame(results)
print(metrics_df)


ImportError: cannot import name 'BayeSearchCV' from 'skopt' (d:\Anaconda\Lib\site-packages\skopt\__init__.py)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(random_state=42)

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_


ValueError: 
All the 180 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Anaconda\Lib\site-packages\sklearn\ensemble\_forest.py", line 345, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\sklearn\base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\sklearn\utils\validation.py", line 1106, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\sklearn\utils\validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\sklearn\utils\_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\pandas\core\generic.py", line 2168, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: '2022-12-03'

--------------------------------------------------------------------------------
144 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Anaconda\Lib\site-packages\sklearn\ensemble\_forest.py", line 345, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\sklearn\base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\sklearn\utils\validation.py", line 1106, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\sklearn\utils\validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\sklearn\utils\_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\pandas\core\generic.py", line 2168, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: '2022-12-14'


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
import matplotlib.pyplot as plt

y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

print("Best Parameters:", grid_search.best_params_)
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))

# Confusion Matrix
print(confusion_matrix(y_test, y_pred))

# ROC Curve
RocCurveDisplay.from_estimator(best_model, X_test, y_test)
plt.show()
