### Step 1: Import all essential libraries

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Suppressing Warnings
# NOTE(review): this installs a blanket "ignore" filter with no category or
# module restriction, so every warning raised after this point is hidden for
# the rest of the session - confirm that this is intended.
import warnings
warnings.filterwarnings('ignore')

### Step 2: Import Dataset & Explore it

In [None]:
# Show every column when a DataFrame is printed, then read the dataset
# from data.csv and preview its first rows.
pd.options.display.max_columns = None
ctrData = pd.read_csv('data.csv')
print(ctrData.head())

In [None]:
# Structural overview of the dataset: dimensions, column names, then the
# per-column dtype / non-null summary.
print(ctrData.shape)
print(ctrData.columns.values)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
ctrData.info()

### Step 3: Check for missing values

In [None]:
# Number of missing entries in each column
print(ctrData.isna().sum())

### The data has no missing values
### Step 4: Now let's visualise the categorical data

In [None]:
# Frequency of each value of the target column 'y'
import matplotlib.pyplot as plt
import seaborn as sb

plt.figure()
sb.countplot(data=ctrData, x='y')
plt.show()

In [None]:
# One count plot per feature column, with the bars split by the target 'y'.
# A single loop replaces eleven copy-pasted countplot/show pairs; the columns
# and the order in which they are drawn are unchanged.
categorical_columns = [
    'C1', 'site_category', 'app_category',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]
for column in categorical_columns:
    sb.countplot(x=column, hue="y", data=ctrData)
    # Render the current figure before drawing the next column's plot
    plt.show()


### Step 5: Drop Unnecessary Values

In [None]:
#ctrData.drop(['device_id', 'C14', 'C17', 'C19', 'C20', 'C21'], axis=1, inplace=True)
# Drop variables that might not have an impact on the output
#ctrData.drop( ['month', 'dayofweek','day','hour'], axis=1, inplace = True )

### Step 6: Split data into Train & Test

In [None]:
# Names of the feature columns used by the models, and of the label column
X = [
    'C1',
    'banner_pos',
    'device_type',
    'device_conn_type',
    'C15',
    'C16',
    'C18',
]
y = 'y'

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs
x_train, x_test, y_train, y_test = train_test_split(
    ctrData[X],
    ctrData[y],
    test_size=0.2,
    random_state=42,
)

### Step 7: Use Logistic Regression

In [None]:
# Build a logistic regression with default settings and fit it on the
# training split (fit() returns the fitted estimator itself)
model = LogisticRegression().fit(x_train, y_train)

# Predicted class labels for the held-out rows
predictions = model.predict(x_test)

In [None]:
# Evaluate the logistic regression on the held-out test set.
# Accuracy, precision, recall and F1 are computed from the hard class labels.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
# ROC AUC is a ranking metric: score it on the predicted probability of the
# positive class (second column of predict_proba) rather than on the 0/1
# labels, which would reduce the ROC curve to a single operating point.
auc_roc = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("AUC-ROC:", auc_roc)


### Step 8: Use Random Forest

In [None]:
# Random Forest Classifier
# The forest is built from random bootstrap samples and random feature
# subsets; seed it (same seed as the train/test split) so that the reported
# metrics are reproducible from run to run.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(x_train, y_train)

# Model evaluation: predicted class labels for the held-out rows
y_pred = rf_classifier.predict(x_test)

In [None]:
# Evaluate the random forest on the held-out test set.
# Accuracy, precision, recall and F1 are computed from the hard class labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ROC AUC is a ranking metric: score it on the predicted probability of the
# positive class (second column of predict_proba) rather than on the 0/1
# labels, which would reduce the ROC curve to a single operating point.
auc_roc = roc_auc_score(y_test, rf_classifier.predict_proba(x_test)[:, 1])

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("AUC-ROC:", auc_roc)

Plotting Confusion Matrix

In [None]:
from sklearn.datasets import make_classification
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Confusion matrix of the predictions currently held in y_pred
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)
# plot_confusion_matrix was removed from scikit-learn in 1.2; its replacement
# is ConfusionMatrixDisplay. Plotting from (y_test, y_pred) also guarantees
# that the figure shows the same predictions as the matrix printed above,
# instead of re-predicting with a different estimator.
metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.show()

### Step 9: Use Dummy Classifier

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline that always predicts the most frequent class seen in training;
# fit() returns the estimator itself, which is left as the cell's last
# expression so the notebook still displays it.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(x_train, y_train)
dummy_clf

In [None]:
# Predictions of the most-frequent-class baseline on the test set
y_pred = dummy_clf.predict(x_test)

# Plot and print confusion matrix
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2; use
# ConfusionMatrixDisplay on the same (y_test, y_pred) pair instead.
metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the logistic regression, scored on the second column of
# predict_proba for the test rows
positive_class_scores = model.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, positive_class_scores)

In [None]:
import matplotlib.pyplot as plt

# ROC curve of the logistic regression on the test set.
# metrics.plot_roc_curve was removed in scikit-learn 1.2; its replacement is
# RocCurveDisplay.from_estimator, which takes the same arguments.
metrics.RocCurveDisplay.from_estimator(model, x_test, y_test)
plt.show()