In [1]:
import numpy as np
import pandas as pd
import nlpaug.augmenter.word as naw
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Loading data
df = pd.read_csv('Data\\balanced_augmented_Call_Conversation(stemming).csv')
print(df.shape)

(602, 4)


In [3]:
df.head(3)

Unnamed: 0,VoiceClip_No,TextTranscript,label,processed_content
0,Voice157,check tell balanc,Non,check tell balanc
1,Voice212,nois come telephon internet work,Non,nois come telephon internet work
2,Voice142,would like add also want upgard internert packag,Non,would like add also want upgard internert packag


In [4]:
x = df['TextTranscript']
y = df['label']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)

In [6]:
# TF-IDF Vectorization (converts the text data(comments) into numerical features)
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Hyperparameter grid to search
param_grid_rf = {
    'n_estimators': [100,200,300],  # Number of trees
    'max_depth': [None, 10, 20],  # Maximum depth of trees
    'min_samples_split': [15,20,25],  # Minimum samples required to split a node
    'min_samples_leaf': [5,10,15]  # Minimum samples required at a leaf node
}

# Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Create K-Fold cross-validator
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Grid search with cross-validation on the training data to get the best hyperparameters
grid_search_rf = GridSearchCV(estimator=rf_classifier, param_grid=param_grid_rf, cv=kf)
grid_search_rf.fit(X_train_tfidf, y_train)

# Best hyperparameters from the grid search
best_params_rf = grid_search_rf.best_params_
print("Best Hyperparameters for Random Forest:", best_params_rf)

Best Hyperparameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 15, 'n_estimators': 200}


In [7]:
# Training the final Random Forest model with the best hyperparameters
final_rf_classifier = RandomForestClassifier(**best_params_rf)
final_rf_classifier.fit(X_train_tfidf, y_train)

In [8]:
y_pred_rf = final_rf_classifier.predict(X_test_tfidf)

# Evaluated the final Random Forest model on the testing dataset
report_rf = classification_report(y_test, y_pred_rf)
print("\nClassification Report for Random Forest on Testing Data:")
print(report_rf)


Classification Report for Random Forest on Testing Data:
              precision    recall  f1-score   support

         Agg       0.92      0.86      0.89        63
         Non       0.85      0.91      0.88        58

    accuracy                           0.88       121
   macro avg       0.89      0.89      0.88       121
weighted avg       0.89      0.88      0.88       121



In [9]:
# Predict class labels on the training dataset
y_train_pred_rf = final_rf_classifier.predict(X_train_tfidf)

# Predict class labels on the testing dataset
y_test_pred_rf = final_rf_classifier.predict(X_test_tfidf)

# Calculate training accuracy
train_accuracy = accuracy_score(y_train, y_train_pred_rf)
print(f'\nTraining Accuracy: {train_accuracy:.4f}')

# Calculate testing accuracy
test_accuracy = accuracy_score(y_test, y_test_pred_rf)
print(f'Testing Accuracy: {test_accuracy:.4f}')


Training Accuracy: 0.9501
Testing Accuracy: 0.8843


In [10]:
Text = """can know the current price the fiber connection"""
# Used the trained model and vectorizer to get the prediction
prediction = final_rf_classifier.predict(vectorizer.transform([Text]))
# Display the prediction
print("Predicted Label:", prediction[0])

Predicted Label: Non


In [14]:
Text = """what the bloody hell is wrong with your agent"""
# Used the trained model and vectorizer to get the prediction
prediction = final_rf_classifier.predict(vectorizer.transform([Text]))
# Display the prediction
print("Predicted Label:", prediction[0])

Predicted Label: Agg


In [16]:
Text = """Disappointed in your service"""
# Used the trained model and vectorizer to get the prediction
prediction = final_rf_classifier.predict(vectorizer.transform([Text]))
# Display the prediction
print("Predicted Label:", prediction[0])

Predicted Label: Non


In [13]:
Text = """Are you freaking kinding me.Have you lost it"""
# Used the trained model and vectorizer to get the prediction
prediction = final_rf_classifier.predict(vectorizer.transform([Text]))
# Display the prediction
print("Predicted Label:", prediction[0])

Predicted Label: Non


Model not predicting the labels correctly