In [1]:
import pandas as pd
# For train/test split using professor id 
from sklearn.model_selection import GroupShuffleSplit 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.svm import SVC

## Cleaning Data

In [2]:
# Load the ratings table; the path is relative to the notebook's working directory.
RATINGS_CSV_PATH = "MSU_ratings.csv"
ratings_csv = pd.read_csv(RATINGS_CSV_PATH)
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which looks like a
# row index that was written into the CSV -- confirm, and consider index_col=0 if so.
ratings_csv

Unnamed: 0,prof_id,course,date,quality,difficulty,comments,thumbs_up,thumbs_down
0,790,MTH132,"Oct 9th, 2019",3.0,4.0,"Prof. Sagan is a nice, caring man, but he is n...",0,0
1,790,MTH317H,"Jan 3rd, 2014",5.0,3.0,AWESOME PROFESSOR!!!!!!!!!,0,0
2,790,MTH317H,"Nov 5th, 2013",5.0,4.0,Professor Sagan is really good math teacher. ...,0,0
3,790,MTH132,"Jan 11th, 2009",5.0,2.0,Excellent professor that was very clear in tea...,0,0
4,790,MATH254H,"Aug 10th, 2005",4.5,3.0,Awesome - made me want to become a Math major ...,0,0
...,...,...,...,...,...,...,...,...
13107,2650948,IAH211B,"Apr 25th, 2022",1.0,5.0,Anning Jing is an absolute abomination of a pr...,0,0
13108,2650948,IAH211B,"Apr 25th, 2022",1.0,5.0,Took IAH211B for university requirement credit...,0,0
13109,2650948,HA260,"Feb 2nd, 2022",1.0,5.0,Extremely vague. Skips around a LOT. Messy sli...,0,0
13110,2650948,IAH211B,"Apr 27th, 2021",1.0,5.0,This class has no reason for being harder than...,0,0


In [3]:
# Keep only the ratings whose comment is not the "No Comments" placeholder.
is_placeholder = ratings_csv["comments"] == "No Comments"
ratings_csv = ratings_csv[~is_placeholder]
ratings_csv

Unnamed: 0,prof_id,course,date,quality,difficulty,comments,thumbs_up,thumbs_down
0,790,MTH132,"Oct 9th, 2019",3.0,4.0,"Prof. Sagan is a nice, caring man, but he is n...",0,0
1,790,MTH317H,"Jan 3rd, 2014",5.0,3.0,AWESOME PROFESSOR!!!!!!!!!,0,0
2,790,MTH317H,"Nov 5th, 2013",5.0,4.0,Professor Sagan is really good math teacher. ...,0,0
3,790,MTH132,"Jan 11th, 2009",5.0,2.0,Excellent professor that was very clear in tea...,0,0
4,790,MATH254H,"Aug 10th, 2005",4.5,3.0,Awesome - made me want to become a Math major ...,0,0
...,...,...,...,...,...,...,...,...
13107,2650948,IAH211B,"Apr 25th, 2022",1.0,5.0,Anning Jing is an absolute abomination of a pr...,0,0
13108,2650948,IAH211B,"Apr 25th, 2022",1.0,5.0,Took IAH211B for university requirement credit...,0,0
13109,2650948,HA260,"Feb 2nd, 2022",1.0,5.0,Extremely vague. Skips around a LOT. Messy sli...,0,0
13110,2650948,IAH211B,"Apr 27th, 2021",1.0,5.0,This class has no reason for being harder than...,0,0


In [4]:
# Features: the raw comment text of every remaining rating.
X = ratings_csv["comments"]
# Labels: boolean Series -- True (positive) when quality is above 2, False (negative) otherwise.
y = ratings_csv["quality"].gt(2)

In [5]:
# Grouped 70/30 train/test split keyed on professor id, so that all ratings of a given
# professor land on the same side of the split.
# Approach from:
# https://stackoverflow.com/questions/54797508/how-to-generate-a-train-test-split-based-on-a-group-id
splitter = GroupShuffleSplit(n_splits=2, test_size=.30, random_state=20)
split = splitter.split(ratings_csv, groups=ratings_csv["prof_id"])
# Only the first of the generated splits is consumed.
train_idx, test_idx = next(split)
# The indices are positional, hence .iloc rather than .loc.
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [6]:
# Sanity-check the grouped split by counting distinct professors overall and per partition.
# dropna=False keeps the count identical to len(pd.unique(...)), which counts NaN as a value.
prof_ids = ratings_csv["prof_id"]
print(f'Total Number of unique prof IDs: {prof_ids.nunique(dropna=False)}')
print(f'Number of unique prof IDs in train data: {prof_ids.iloc[train_idx].nunique(dropna=False)}')
print(f'Number of unique prof IDs in test data: {prof_ids.iloc[test_idx].nunique(dropna=False)}')

Total Number of unique prof IDs: 1662
Number of unique prof IDs in train data: 1163
Number of unique prof IDs in test data: 499


In [7]:
# The split is made per professor, so the share of individual ratings on each side
# only approximates 70/30; report the realised percentages.
train_pct = (len(X_train) / len(X)) * 100
test_pct = (len(X_test) / len(X)) * 100
print(f"True split (based on number of ratings): {train_pct}% Train, {test_pct}% Test")

True split (based on number of ratings): 69.61006485895132% Train, 30.389935141048685% Test


## Preprocessing 

In [8]:
# Token-count (bag-of-words) vectorizer created with scikit-learn's default settings.
vectorizer = CountVectorizer()

In [9]:
# Fit the vectorizer on the training comments only and encode them in the same step.
X_train_vect = vectorizer.fit_transform(X_train)

In [10]:
# Encode the test comments with the already-fitted vectorizer (transform only, no refit),
# so nothing is learned from the test set.
X_test_vect = vectorizer.transform(X_test)

## SVM

In [11]:
# Support vector classifier created with scikit-learn's default hyperparameters.
svc = SVC()

In [12]:
# Train the classifier on the vectorized training comments and their boolean labels.
svc.fit(X_train_vect,y_train)

In [13]:
# Predict labels for the vectorized test comments.
y_test_pred = svc.predict(X_test_vect)

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [15]:
# Report each test-set metric by comparing the true labels with the SVM predictions.
for label, metric in (
    ("Accuracy Score", accuracy_score),
    ("Precision Score", precision_score),
    ("Recall Score", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{label}: {metric(y_test, y_test_pred)}")

Accuracy Score: 0.8755464129596298
Precision Score: 0.8917525773195877
Recall Score: 0.9388567293777135
F1 Score: 0.9146986253084244
