In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_curve, auc


# Let pandas print up to 100 rows and 100 columns before truncating output.
pd.set_option(
    'display.max_rows', 100,
    'display.max_columns', 100,
)

In [None]:
# Read the leads CSV (path is relative to the notebook's working directory)
# and preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='files/Leads.csv')
df.head(n=5)

In [None]:
# Quick textual overview of the raw frame: first rows, schema, summary stats.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
print(df.describe(include='all'))

In [None]:
# --- Missing-value handling -------------------------------------------------
# The two index columns are split on "." and only the leading part is kept so
# they can be parsed as numbers below (presumably the raw values look like
# "02.Medium" — confirm against the CSV).
df['Asymmetrique Activity Index'] = df['Asymmetrique Activity Index'].str.split(".").str[0]
df['Asymmetrique Profile Index'] = df['Asymmetrique Profile Index'].str.split(".").str[0]

numerical_columns_with_missing = [
    'Asymmetrique Activity Index', 'Asymmetrique Profile Index',
    'Asymmetrique Activity Score', 'Asymmetrique Profile Score'
]

# Anything that cannot be parsed as a number becomes NaN and is imputed below.
for column in numerical_columns_with_missing:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Impute numeric gaps with the most frequent value.
# Series.mode() returns a Series (there can be ties); handing that Series to
# fillna() aligns on the index and therefore only touches the first row(s),
# leaving nearly every NaN in place. Take the first mode as a scalar instead.
# Assigning the result back (rather than inplace=True on a column selection)
# also keeps this working under pandas copy-on-write.
mode_imputed_columns = numerical_columns_with_missing + ['Page Views Per Visit', 'TotalVisits']
for column in mode_imputed_columns:
    modes = df[column].mode()
    if not modes.empty:  # an all-NaN column has no mode; leave it untouched
        df[column] = df[column].fillna(modes.iloc[0])

categorical_columns_with_missing = [
    'Last Activity', 'Country', 'Specialization', "Lead Source",
    'How did you hear about X Education', 'What is your current occupation',
    'What matters most to you in choosing a course', 'Tags', 'Lead Quality', 'Lead Profile', 'City'
]

# Missing categories get an explicit 'Unknown' level instead of being dropped.
for column in categorical_columns_with_missing:
    df[column] = df[column].fillna('Unknown')

In [None]:
# Print the number of missing values remaining in every column.
columns = df.columns.to_list()

missing_counts = df.isnull().sum()
for column_name, n_missing in missing_counts.items():
    print(f"{column_name}: {n_missing}")

In [None]:
# --- Categorical encoding ---------------------------------------------------
categorical_columns = [
    'Lead Origin', 'Last Activity', 'Country', 'Specialization',
    'How did you hear about X Education', 'What is your current occupation',
    'What matters most to you in choosing a course', 'Tags', 'Lead Quality', 'Lead Profile', 'City'
]
# NOTE(review): 'Lead Source' is imputed with 'Unknown' earlier but is not
# one-hot encoded here, so it never reaches the numeric feature matrix —
# confirm whether that omission is intentional.

# One-hot encode, dropping the first level of each column.
# dtype=int is explicit because pandas >= 2.0 emits boolean dummies by
# default, and the later select_dtypes(include=['number']) step skips boolean
# columns — which would silently discard every encoded feature.
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True, dtype=int)

binary_columns = ['Do Not Email', 'Do Not Call', 'Search', 'Magazine', 'Newspaper Article', 'X Education Forums', 
                  'Newspaper', 'Digital Advertisement', 'Through Recommendations', 
                  'Receive More Updates About Our Courses', 'I agree to pay the amount through cheque', 
                  'A free copy of Mastering The Interview']

# Map 'Yes' -> 1 and everything else (including missing values) -> 0.
# Vectorised comparison replaces the per-element apply(lambda ...).
for column in binary_columns:
    df[column] = (df[column] == 'Yes').astype(int)


In [None]:
# Continuous columns that are standardized in place on `df`.
numerical_features = [
    'TotalVisits',
    'Total Time Spent on Website',
    'Page Views Per Visit',
    'Asymmetrique Activity Index',
    'Asymmetrique Profile Index',
    'Asymmetrique Activity Score',
    'Asymmetrique Profile Score',
]

# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split performed further down, so test rows contribute to the scaling
# statistics.
scaler = StandardScaler()
scaler.fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])


In [None]:
# Histogram of TotalVisits. When the notebook is run top to bottom the column
# has already been standardized, so the x-axis shows scaled values.
fig, ax = plt.subplots()
df['TotalVisits'].hist(bins=50, ax=ax)
ax.set_title('Distribution of Total Visits')
ax.set_xlabel('TotalVisits')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Print the dtype of every column after encoding and scaling.
columns = df.columns.to_list()

for column_name, column_dtype in df.dtypes.items():
    print(f"{column_name}: {column_dtype}")

In [None]:
# Build the numeric feature frame used for correlation and modelling.
# 'bool' is included alongside 'number' because pandas >= 2.0 produces boolean
# one-hot columns by default, and those are not matched by 'number' alone.
numeric_columns = df.select_dtypes(include=['number', 'bool']).columns

# Work on an explicit copy: filling a frame sliced out of `df` in place
# triggers SettingWithCopyWarning and is unreliable under copy-on-write.
df_numeric = df[numeric_columns].copy()

# Store boolean indicators as 0/1 integers so every column is plainly numeric.
bool_columns = df_numeric.select_dtypes(include=['bool']).columns
df_numeric = df_numeric.astype({column: int for column in bool_columns})

# Any remaining gaps are filled with the column median.
df_numeric = df_numeric.fillna(df_numeric.median(numeric_only=True))

In [None]:
# Pairwise correlations between all numeric columns.
correlation_matrix = df_numeric.corr()

# Correlation of each column with the target, strongest positive first.
converted_correlations = correlation_matrix['Converted']
print(converted_correlations.sort_values(ascending=False))

# Heat map of the full matrix.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('Correlation Matrix')
heatmap = ax.imshow(correlation_matrix, cmap='coolwarm', interpolation='none')
fig.colorbar(heatmap, ax=ax)
plt.show()

In [None]:
# Columns that contain at least one negative value.
# The previous check, `(col > 0).count() < 0`, could never be true: count() on
# a boolean mask returns the number of non-null entries, which is never below
# zero. The list therefore always came out empty.
#
# These columns are reported but deliberately NOT dropped: the standardized
# numeric features are negative for roughly half of their rows, the chi2 cell
# below applies abs() to its input, and the random forest accepts negative
# values, so removing them would only discard features from the model.
NegativeColumns = [
    column_name
    for column_name in df_numeric.columns
    if (df_numeric[column_name] < 0).any()
]

# Only the identifier and the target are removed from the feature matrix, which
# is what the original cell effectively did. errors='ignore' makes the cell
# safe to re-run (the in-place drop raised KeyError the second time).
columns_to_drop = ['Lead Number', 'Converted']
df_numeric = df_numeric.drop(columns=columns_to_drop, errors='ignore')

print(df_numeric.columns)
print(NegativeColumns)

In [None]:
# Feature matrix and target used for scoring and, later, for modelling.
X = df_numeric
y = df['Converted']

# chi2 rejects negative inputs, hence the absolute value.
# NOTE(review): abs() maps standardized values of opposite sign onto each
# other, so the scores for the scaled columns should be read with care.
selector = SelectKBest(score_func=chi2, k=10)
selector.fit(X.abs(), y)

# One chi2 score per feature, highest first.
scores = pd.DataFrame({'Score': selector.scores_}, index=X.columns)
print(scores.sort_values(by='Score', ascending=False))


In [None]:
# Hold out 20% of the rows for evaluation; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest as well: without random_state the bootstrap samples and
# feature sub-sampling differ on every run, so the reported metrics (and the
# ROC curve drawn later) were not reproducible despite the seeded split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out rows.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Pairwise scatter/histogram grid for two engagement columns and the target.
pairplot_columns = ['TotalVisits', 'Total Time Spent on Website', 'Converted']
pairplot_data = df[pairplot_columns]
sns.pairplot(pairplot_data)
plt.show()

In [None]:
# ROC curve built from the predicted probability of the positive class
# (second column of predict_proba) on the held-out rows.
positive_class_scores = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, positive_class_scores)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line: performance of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()
