# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset
data = pd.read_csv('leads.csv')

# Inspect the data: first rows, column dtypes / non-null counts, summary stats
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
data.info()
print(data.describe())

# Data Preprocessing
# Replace 'Select' levels with NaN so they are handled as missing values below
# NOTE(review): presumably 'Select' is an unfilled form placeholder — confirm
data.replace('Select', np.nan, inplace=True)

# Handle missing values
# Drop columns with a high percentage of missing values: a column is kept only
# when at least 70% of its rows are non-null. `thresh` is documented as an
# integer count, so round the fractional threshold up (same cut-off as
# comparing the integer non-null count against the float).
min_non_null = int(np.ceil(0.7 * len(data)))
data.dropna(axis=1, thresh=min_non_null, inplace=True)

# Fill remaining missing values with the mode for categorical variables and
# the median for numerical variables.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on data[column]: that form is a chained in-place update
# on an intermediate object, which pandas deprecates and which does not modify
# `data` under copy-on-write.
for column in data.select_dtypes(include=['object']).columns:
    modes = data[column].mode()
    # mode() is empty when the column holds no non-null values; nothing to fill
    if not modes.empty:
        data[column] = data[column].fillna(modes[0])
for column in data.select_dtypes(include=['number']).columns:
    data[column] = data[column].fillna(data[column].median())

# Encode categorical variables
# Each object column gets its own LabelEncoder; the fitted encoders are kept
# in `label_encoders`, keyed by column name, so the integer codes can be
# mapped back to the original labels later.
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Exploratory Data Analysis (EDA)
# How many rows fall into each value of the target column
sns.countplot(data=data, x='Converted')
plt.show()

# Heatmap of the pairwise column correlations, annotated to two decimals
plt.figure(figsize=(12, 8))
correlations = data.corr()
sns.heatmap(correlations, annot=True, fmt='.2f')
plt.show()

# Time spent on the website, split by the target value
sns.boxplot(data=data, x='Converted', y='Total Time Spent on Website')
plt.show()

# Feature Engineering
# Select relevant features based on domain knowledge and correlation analysis
# NOTE(review): these names must match the CSV header exactly — confirm
features = ['Lead Source', 'Total Time Spent on Website', 'Total Visits', 'Last Activity']
X = data[features]
y = data['Converted']

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
numeric_features = ['Total Time Spent on Website', 'Total Visits']
# Work on explicit copies so the column assignments below modify independent
# frames rather than slices derived from `data` (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()
scaler = StandardScaler()
# Fit the scaler on the training split only, then apply the SAME transform to
# the test split. Leaving X_test unscaled would feed the model features on a
# different scale from the one it was trained on and invalidate the evaluation.
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])
# Model Building
# Random forest with a fixed seed; fit() returns the fitted estimator itself
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predictions on the held-out split
y_pred = model.predict(X_test)

# Model Evaluation: overall accuracy, confusion matrix, per-class report
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

# Feature Importance
# One row per selected feature, ordered from most to least important
feature_importances = pd.DataFrame(
    {'Importance': model.feature_importances_},
    index=features,
)
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print(feature_importances)

# Visualize feature importance as a bar chart
feature_importances.plot.bar()
plt.show()