## 1. Data Loading & Inspection

In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,Cus.ID,Date,Cus. Location,Age,Gender,Mobile Name,Sell Price,Does he/she Come from Facebook Page?,Does he/she Followed Our Page?,Did he/she buy any mobile before?,Did he/she hear of our shop before?
0,1,27-05-2024,Rangamati Sadar,49,F,Galaxy A55 5G 8/128,17073.0,No,Yes,No,Yes
1,2,27-05-2024,Inside Rangamati,44,M,Redmi Note 12 Pro 8/128,15546.0,Yes,No,No,Yes
2,3,27-05-2024,Rangamati Sadar,45,M,R-70 Turbo 5G 6/128,26516.0,Yes,No,No,Yes
3,4,27-05-2024,Rangamati Sadar,46,M,R-70 Turbo 5G 6/128,21927.0,No,No,No,Yes
4,5,27-05-2024,Outside Rangamati,27,F,Vivo T3x 5G 8/128,16718.0,Yes,No,No,Yes


## 2. Exploratory Data Analysis (EDA)

In [2]:
df.info()
df.describe(include='all')
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8871 entries, 0 to 8870
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Cus.ID                                8871 non-null   int64  
 1   Date                                  8871 non-null   object 
 2   Cus. Location                         8871 non-null   object 
 3   Age                                   8871 non-null   int64  
 4   Gender                                8871 non-null   object 
 5   Mobile Name                           8871 non-null   object 
 6   Sell Price                            8871 non-null   float64
 7   Does he/she Come from Facebook Page?  8871 non-null   object 
 8   Does he/she Followed Our Page?        8871 non-null   object 
 9   Did he/she buy any mobile before?     8871 non-null   object 
 10  Did he/she hear of our shop before?   8871 non-null   object 
dtypes: float64(1), in

Cus.ID                                  0
Date                                    0
Cus. Location                           0
Age                                     0
Gender                                  0
Mobile Name                             0
Sell Price                              0
Does he/she Come from Facebook Page?    0
Does he/she Followed Our Page?          0
Did he/she buy any mobile before?       0
Did he/she hear of our shop before?     0
dtype: int64

## 3. Feature Engineering

In [3]:
# Feature combinations
df['social_engagement'] = df['came_from_facebook'] + df['follows_facebook_page']
df['brand_by_location'] = df['mobile_name'] + '_' + df['customer_location']
df['gender_social_source'] = df['gender'] + '_' + df['came_from_facebook'].astype(str)
df['fb_heard_combo'] = df['follows_facebook_page'] * df['came_from_facebook']

KeyError: 'came_from_facebook'

## 4. Encoding Categorical Features

In [None]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['mobile_name', 'customer_location', 'heard_about_shop', 'gender', 'brand_by_location', 'gender_social_source']
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

## 5. Mutual Information / Feature Importance

In [None]:
from sklearn.feature_selection import mutual_info_classif

X = df.drop('returning_customer', axis=1)
y = df['returning_customer']
mi = mutual_info_classif(X, y, discrete_features='auto')
mi_series = pd.Series(mi, index=X.columns).sort_values(ascending=False)
mi_series

## 6. Model Building (Logistic Regression as Baseline)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)
y_pred = model_lr.predict(X_val)
print(classification_report(y_val, y_pred))

## 7. Cross-Validation & Model Comparison

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

model_rf = RandomForestClassifier(random_state=1)
scores = cross_val_score(model_rf, X, y, cv=5, scoring='accuracy')
print('Random Forest CV Accuracy:', scores.mean())

## 8. Model Evaluation & Visualization

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_val)
y_prob_rf = model_rf.predict_proba(X_val)[:,1]
print(classification_report(y_val, y_pred_rf))
print('ROC AUC:', roc_auc_score(y_val, y_prob_rf))

sns.heatmap(confusion_matrix(y_val, y_pred_rf), annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

## 9. Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

params = {'n_estimators': [100, 200], 'max_depth': [3, 5, 10]}
grid = GridSearchCV(RandomForestClassifier(), param_grid=params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
print('Best Params:', grid.best_params_)
print('Best Score:', grid.best_score_)

## 10. Save Final Model

In [None]:
import joblib
joblib.dump(grid.best_estimator_, 'final_model.pkl')