# Python Library Installation

In [12]:
!pip install pandas numpy matplotlib seaborn scikit-learn imbalanced-learn
!pip install jupyterlab


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


# Library Import

In [13]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_curve, auc
from matplotlib.legend_handler import HandlerLine2D
from imblearn.combine import SMOTETomek
from collections import Counter

# Importing and loading labeled csv file that is stored in Google Drive to this notebook


In [8]:

url='https://drive.google.com/file/d/196DQe4XGxjd__hUTwR4jndIOZgBbbzkv/view?usp=drive_link'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]
df = pd.read_csv(url)
df.head()

# Convert numeric columns to float, handle non-numeric values
df = df.apply(pd.to_numeric, errors='coerce')

# Display the first few rows of the DataFrame
print(df.head())

# Min-Max normalization for each feature
normalized_df = (df - df.min()) / (df.max() - df.min())

# Display the first few rows of the normalized DataFrame
print(normalized_df.head())

"""# Dataframe Information Capture"""

df.info()

df['label'].describe()

df['Sent2'].describe()
df['Sent2'].value_counts()

normalized_df['sent_turns'].describe()

df['label'].value_counts()

normalized_df['T_Words/Turn'].describe()

normalized_df['Engagement'].describe()

engagement_array = np.array(df['Engagement'])
print(engagement_array)

df['Engagement'].value_counts()

"""# Dataframe Parameter Corelation Plots and SNS heatmap for Future use with SMOTE"""

df['label'] = df['label'].apply(lambda x: str(x))
plotdf = df.drop(df.columns[1],axis =1)
sns.pairplot(data=plotdf)

plt.figure(figsize=(12,10))
sns.heatmap(df.corr(),cmap='viridis',annot=True,cbar=False)

   id  label  T_Words/Turn  C_Words/Turn  T_Turns  C_Turns  Ratio_of_turns  \
0 NaN    NaN     12.500000     19.000000       10        9        1.111111   
1 NaN    NaN     12.588235     13.277778       17       18        0.944444   
2 NaN    NaN     45.500000     17.428571        8        7        1.142857   
3 NaN    NaN     26.833333     15.000000       12       12        1.000000   
4 NaN    NaN     13.384615     27.833333       13       12        1.083333   

   T_and_C_WPT_Ratio  Engagement  Sem_All_Avg  Sem_Adj_Avg  Sent2  sent_turns  
0           0.657895           1       0.2853       0.3368      1           3  
1           0.948068           1       0.2721       0.2402      2           9  
2           2.610656           0       0.2572       0.2556      1           4  
3           1.788889           1       0.2991       0.2834      1           9  
4           0.480884           1       0.2514       0.2507      1           9  
   id  label  T_Words/Turn  C_Words/Turn   T_Turns 

<AxesSubplot:>

# Dataframe Normalization

In [None]:
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, auc
import matplotlib.pyplot as plt
from collections import Counter

# Assuming df is your dataframe
cols_to_drop = ['id', 'label', 'Sent2']
df.drop(cols_to_drop, axis=1, inplace=True)

# Define features and target variable
all_features = df.columns[df.columns != 'Engagement']
output_var = 'Engagement'

# Number of runs
num_runs = 3  # Adjust as needed

# Set the desired sampling strategy
sampling_strategy = 0.8  # Adjust as needed

for run in range(num_runs):
    print(f"Run {run + 1}:")

    # Split the data into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(df[all_features], df[output_var], test_size=0.2, random_state=run)

    # Apply SMOTE + Tomek oversampling on the training set for Random Forest
    smtom = SMOTETomek(sampling_strategy=sampling_strategy, random_state=139)
    x_train_smtom, y_train_smtom = smtom.fit_resample(x_train, y_train)

    # Building the Random Forest model
    rf_model = RandomForestClassifier(n_estimators=85, max_depth=30)

    # Fit Random Forest model to training data
    rf_model.fit(x_train_smtom, y_train_smtom)

    # Model Evaluation for Random Forest

    # Accuracy Measure
    print("RF In-sample accuracy: %0.2f" % accuracy_score(y_train_smtom, rf_model.predict(x_train_smtom)))
    print("RF Out-of-sample accuracy: %0.2f" % accuracy_score(y_test, rf_model.predict(x_test)))

    # Apply SMOTE + Tomek oversampling on the training set for Gradient Boosting
    x_train_smtom_gbm, y_train_smtom_gbm = smtom.fit_resample(x_train, y_train)

    # Building the GBM model
    gbm_model = GradientBoostingClassifier()

    # Hyperparameter tuning for GBM
    param_grid_gbm = {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0]
    }

    grid_search_gbm = GridSearchCV(estimator=gbm_model, param_grid=param_grid_gbm, scoring='accuracy', cv=3, n_jobs=-1)
    grid_search_gbm.fit(x_train_smtom_gbm, y_train_smtom_gbm)

    best_params_gbm = grid_search_gbm.best_params_
    print("Best GBM Hyperparameters:", best_params_gbm)

    # Fit the GBM model with the best parameters to the training data
    gbm_model = GradientBoostingClassifier(**best_params_gbm)
    gbm_model.fit(x_train_smtom_gbm, y_train_smtom_gbm)

    # Model Evaluation for GBM

    # Accuracy Measure
    print("GBM In-sample accuracy: %0.2f" % accuracy_score(y_train_smtom_gbm, gbm_model.predict(x_train_smtom_gbm)))
    print("GBM Out-of-sample accuracy: %0.2f" % accuracy_score(y_test, gbm_model.predict(x_test)))

    # Additional evaluation metrics
    print("RF Precision: {:.2f}".format(precision_score(y_test, rf_model.predict(x_test))))
    print("RF Recall: {:.2f}".format(recall_score(y_test, rf_model.predict(x_test))))
    print("GBM Precision: {:.2f}".format(precision_score(y_test, gbm_model.predict(x_test))))
    print("GBM Recall: {:.2f}".format(recall_score(y_test, gbm_model.predict(x_test))))

    # ROC and AUC curve for RF
    in_sample_prob_rf = rf_model.predict_proba(x_train_smtom)[:, 1]
    out_sample_prob_rf = rf_model.predict_proba(x_test)[:, 1]
    in_sample_fpr_rf, in_sample_tpr_rf, _ = roc_curve(y_train_smtom, in_sample_prob_rf)
    out_sample_fpr_rf, out_sample_tpr_rf, _ = roc_curve(y_test, out_sample_prob_rf)

    print("RF In-sample AUC: %0.4f" % auc(in_sample_fpr_rf, in_sample_tpr_rf))
    print("RF Out-of-sample AUC: %0.4f" % auc(out_sample_fpr_rf, out_sample_tpr_rf))

    # ROC and AUC curve for GBM
    in_sample_prob_gbm = gbm_model.predict_proba(x_train_smtom_gbm)[:, 1]
    out_sample_prob_gbm = gbm_model.predict_proba(x_test)[:, 1]
    in_sample_fpr_gbm, in_sample_tpr_gbm, _ = roc_curve(y_train_smtom_gbm, in_sample_prob_gbm)
    out_sample_fpr_gbm, out_sample_tpr_gbm, _ = roc_curve(y_test, out_sample_prob_gbm)

    print("GBM In-sample AUC: %0.4f" % auc(in_sample_fpr_gbm, in_sample_tpr_gbm))
    print("GBM Out-of-sample AUC: %0.4f" % auc(out_sample_fpr_gbm, out_sample_tpr_gbm))

    # Plot ROC curve for both models
    plt.figure(figsize=(10, 7))
    plt.plot(out_sample_fpr_rf, out_sample_tpr_rf, color='orange', label='RF Out-sample ROC curve (area= %0.2f)' % auc(out_sample_fpr_rf, out_sample_tpr_rf))
    plt.plot(in_sample_fpr_rf, in_sample_tpr_rf, color='blue', label='RF In-sample ROC curve (area= %0.2f)' % auc(in_sample_fpr_rf, in_sample_tpr_rf))
    plt.plot(out_sample_fpr_gbm, out_sample_tpr_gbm, color='green', label='GBM Out-sample ROC curve (area= %0.2f)' % auc(out_sample_fpr_gbm, out_sample_tpr_gbm))
    plt.plot(in_sample_fpr_gbm, in_sample_tpr_gbm, color='red', label='GBM In-sample ROC curve (area= %0.2f)' % auc(in_sample_fpr_gbm, in_sample_tpr_gbm))
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve for RF and GBM Models')
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()
