In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('popular')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

file_path = '/content/drive/My Drive/RMP_data.csv'
df = pd.read_csv(file_path)
filtered_df = df[['comments', 'student_star']]
print(filtered_df.head())


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

                                            comments  student_star
0  This class is hard, but its a two-in-one gen-e...           5.0
1  Definitely going to choose Prof. Looney\'s cla...           5.0
2  I overall enjoyed this class because the assig...           4.0
3  Yes, it\'s possible to get an A but you\'ll de...           5.0
4  Professor Looney has great knowledge in Astron...           5.0


**Give Appropriate Label to feedback based on rating and remove garbage data**

---



In [None]:
import pandas as pd

def assign_label(star):
    if 3.5 <= star <= 5.0:
        return 1
    elif 2.5 < star < 3.5:
        return 0
    elif 1.0 <= star <= 2.5:
        return -1
    else:
        return None

filtered_df.loc[:, 'label'] = filtered_df['student_star'].apply(assign_label)
filtered_df = filtered_df[filtered_df['comments'] != "No Comments"]
filtered_df = filtered_df[filtered_df['comments'].str.strip() != '']
filtered_df.dropna(subset=['comments'], inplace=True)

print(filtered_df.head())


                                            comments  student_star  label
0  This class is hard, but its a two-in-one gen-e...           5.0    1.0
1  Definitely going to choose Prof. Looney\'s cla...           5.0    1.0
2  I overall enjoyed this class because the assig...           4.0    1.0
3  Yes, it\'s possible to get an A but you\'ll de...           5.0    1.0
4  Professor Looney has great knowledge in Astron...           5.0    1.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.loc[:, 'label'] = filtered_df['student_star'].apply(assign_label)


In [None]:
filtered_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1.0,12396
-1.0,5299
0.0,1331


**Assuring that our data has equal proportions of all three types of feedback**


In [None]:
print(filtered_df['label'].value_counts())

positive_feedback = filtered_df[filtered_df['label'] == 1.0]
negative_feedback = filtered_df[filtered_df['label'] == -1.0]
neutral_feedback = filtered_df[filtered_df['label'] == 0.0]

min_class_size = len(neutral_feedback)
positive_sampled = positive_feedback.sample(min_class_size, random_state=42)
negative_sampled = negative_feedback.sample(min_class_size, random_state=42)
neutral_sampled = neutral_feedback.sample(min_class_size, random_state=42)

balanced_df = pd.concat([positive_sampled, negative_sampled, neutral_sampled])

balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(balanced_df['label'].value_counts())

print(balanced_df.head())


label
 1.0    12396
-1.0     5299
 0.0     1331
Name: count, dtype: int64
label
-1.0    1331
 1.0    1331
 0.0    1331
Name: count, dtype: int64
                                            comments  student_star  label
0  This guy\'s a genius, but is out of touch w/ t...           2.5   -1.0
1  Todd makes learning algebra fun and I learned ...           5.0    1.0
2  She spent far too much time writing code on th...           2.5   -1.0
3  Professor Catalano is AWESOME. he is really pa...           5.0    1.0
4  Attention Deficit Disorder describes Brian. Ca...           3.0    0.0


**Word Lemmetization and Tokenization**

In [None]:

balanced_df['comments'] = balanced_df['comments'].astype(str)
balanced_df['comments'] = balanced_df['comments'].apply(lambda x: x.lower() if isinstance(x, str) else x)
balanced_df['comments'] = [word_tokenize(entry) for entry in balanced_df['comments']]

tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

text_final = []

for entry in balanced_df['comments']:
    if isinstance(entry, list):
        final_words = []
        for word, tag in pos_tag(entry):
            if word not in stop_words and word.isalpha():
                final_word = lemmatizer.lemmatize(word, tag_map[tag[0]])
                final_words.append(final_word)
        text_final.append(" ".join(final_words))
    else:
        text_final.append("")

balanced_df['text_final'] = text_final
print(balanced_df[['comments', 'text_final']].head())


                                            comments  \
0  [this, guy\, 's, a, genius, ,, but, is, out, o...   
1  [todd, makes, learning, algebra, fun, and, i, ...   
2  [she, spent, far, too, much, time, writing, co...   
3  [professor, catalano, is, awesome, ., he, is, ...   
4  [attention, deficit, disorder, describes, bria...   

                                          text_final  
0             genius touch class lecture light speed  
1  todd make learn algebra fun learn lot always t...  
2  spend far much time write code board julie lut...  
3  professor catalano awesome really passionate t...  
4  attention deficit disorder describe brian sit ...  


In [None]:
balanced_df.dropna(subset=['text_final'], inplace=True)
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(balanced_df['text_final'],balanced_df['label'],test_size=0.15, random_state = 57)
print(type(Train_X))
print(Train_X.shape,Train_Y.shape)
Train_Y.value_counts()

<class 'pandas.core.series.Series'>
(3394,) (3394,)


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0.0,1146
-1.0,1135
1.0,1113


**Encode the labels**

In [None]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

print("Classes:", Encoder.classes_)

Classes: [0 1 2]


**TF-IDF Vectorization**

In [None]:
Tfidf_vect = TfidfVectorizer(max_features=8000)
Tfidf_vect.fit(balanced_df['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

print(Test_X_Tfidf.shape)

(599, 6009)


**Train the model using Support Vector Machine**

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)

predictions_SVM = SVM.predict(Test_X_Tfidf)

output = pd.DataFrame(data={"Text": Test_X, "Result": predictions_SVM, "Actual": Test_Y})

print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, Test_Y) * 100)
print("SVM Precision Score -> ", precision_score(Test_Y, predictions_SVM, average='weighted') * 100)
print("SVM Recall Score -> ", recall_score(Test_Y, predictions_SVM, average='weighted') * 100)
print("SVM F1 Score -> ", f1_score(Test_Y, predictions_SVM, average='weighted') * 100)
matrix = confusion_matrix(Test_Y, predictions_SVM)
print("Confusion Matrix:")
print(matrix)
output.to_csv(r'result12.csv', index=False)

# res = output['Result'].value_counts()
# POS = (res[1]) / (res[1] + res[0]) * 100
# print("Number of Positive Review:", POS, "%")


SVM Accuracy Score ->  58.764607679465776
SVM Precision Score ->  60.75893008667491
SVM Recall Score ->  58.764607679465776
SVM F1 Score ->  59.35668495280427
Confusion Matrix:
[[119  60  17]
 [ 38 104  43]
 [ 18  71 129]]


**Saving model files to deploy on huggingface**

In [None]:
import joblib
joblib.dump(SVM, 'svm_model.pkl')
joblib.dump(Tfidf_vect, 'tfidf_vectorizers.pkl')




['tfidf_vectorizers.pkl']

**Sample outputs**

In [None]:
output = pd.DataFrame(data={"Text": Test_X, "Predicted": predictions_SVM, "Actual": Test_Y})
print(output.sample(10))


                                                   Text  Predicted  Actual
3688  ['boring', 'ramble', 'without', 'ever', 'get',...          0       0
4855  ['lot', 'information', 'want', 'miss', 'class'...          0       1
693   ['go', 'class', 'read', 'assigned', 'material'...          1       0
5402  ['bore', 'instructor', 'lecture', 'book', 'tes...          1       0
2924  ['guy', 'think', 'he', 'tough', 'stuff', 'dont...          2       0
240                        ['brilliant', 'easy', 'eye']          2       2
5557  ['know', 'use', 'scare', 'tactic', 'weed', 'ba...          1       0
4245                         ['best', 'prof', 'campus']          2       2
6129  ['nice', 'smart', 'class', 'cover', 'material'...          0       0
1977  ['great', 'professor', 'knowledgable', 'field'...          2       2
