In [63]:
# importing necessary libraries
import pandas as pd

In [64]:
# load the scraped reviews file; its first column is used as the row index
course_reviews_data = pd.read_csv(
    "2024_02_21_all_reviews",
    index_col=0,
)
# working frame used by every later cell
df = pd.DataFrame(data=course_reviews_data)

In [65]:
# preview the first five rows
df.head(n=5)

Unnamed: 0,course_code,status_code,star_count,prof_name,prof_slug,grade,reviewer_name,date,review_text,review_id
0,NFSC112,200,4,Abani Pradhan,/professor/pradhan,Expecting an A,Anonymous,12/23/2023,"Didn't learn anything, but didn't need to. Eas...",review-31256
1,NFSC112,200,5,Qin Wang,/professor/wang_qin,Expecting an A,Anonymous,12/18/2023,Easy class. The lectures are asynchronous but ...,review-30485
2,NFSC112,200,5,Xiaoxue Jia,/professor/jia_xiaoxue,Expecting an A,Anonymous,12/13/2023,She is amazing with ADS and helpful with answe...,review-29852
3,NFSC112,200,2,Abani Pradhan,/professor/pradhan,Expecting a B-,Anonymous,12/06/2023,Easy class as long as you actually watch the l...,review-28855
4,NFSC112,200,5,Qin Wang,/professor/wang_qin,Expecting an A,Anonymous,11/26/2023,"she is a good lecturer, nfsc 112, is ""chill,"" ...",review-28192


In [66]:
# distinct star ratings present in the data
df["star_count"].unique()

array([4, 5, 2, 3, 1])

In [67]:
# distinct raw values of the free-text grade field
df["grade"].unique()

array(['Expecting an A', 'Expecting a B-', 'Expecting an A+',
       'Expecting a B+', 'Expecting an A-', 'Expecting a B', nan,
       'Expecting a C', 'Expecting a b', 'Expecting a C+',
       'Expecting an F', 'Expecting a D', 'Expecting a W',
       'Expecting a P', 'Expecting a a', 'Expecting a B?',
       'Expecting a c?', 'Expecting a c-', 'Expecting a C-',
       'Expecting a D+', 'Expecting a 65', 'Expecting a b+',
       'Expecting a c', 'Expecting an XF', 'Expecting a ?',
       'Expecting a D-', 'Expecting a A?', 'Expecting a AA',
       'Expecting a -A', 'Expecting a s', 'Expecting a b-',
       'Expecting a C?', 'Expecting a d', 'Expecting a I',
       'Expecting a ??', 'Expecting a a-', 'Expecting a -',
       'Expecting a 74', 'Expecting a BC', 'Expecting a CD',
       'Expecting a F+', 'Expecting a CB', 'Expecting a 85',
       'Expecting a 95', 'Expecting a AU'], dtype=object)

In [68]:
# number of missing values in each column
df.isna().sum()

course_code         0
status_code         0
star_count          0
prof_name           0
prof_slug           0
grade            4135
reviewer_name       0
date                0
review_text         5
review_id           0
dtype: int64

In [69]:
# number of rows that repeat an earlier row exactly
df.duplicated(keep="first").sum()

19

In [70]:
# how many reviews each reviewer name accounts for
df["reviewer_name"].value_counts()

Anonymous       22644
akl2025            27
zhangsta           25
florm              22
a778999            21
                ...  
wassupgirl          1
Anonymous515        1
sccerkid02          1
iwilder             1
BioBoy05            1
Name: reviewer_name, Length: 1690, dtype: int64

In [71]:
# occurrences per review id; a count above 1 means the id is repeated
df["review_id"].value_counts()

review-31256    2
review-27350    2
review-30485    2
review-2464     2
review-2463     2
               ..
review-29341    1
review-30428    1
review-31267    1
review-4301     1
review-31530    1
Name: review_id, Length: 26144, dtype: int64

### Data Preprocessing

In [72]:
# remove the columns that are not used later (professor slug, HTTP status code)
df.drop(columns=['prof_slug', 'status_code'], inplace=True)

In [73]:
# confirm which columns remain after the drop
df.columns

Index(['course_code', 'star_count', 'prof_name', 'grade', 'reviewer_name',
       'date', 'review_text', 'review_id'],
      dtype='object')

In [74]:
# remove exact duplicate rows, keeping the first occurrence of each
df.drop_duplicates(keep="first", inplace=True)
# sanity check: no duplicate rows should remain
df.duplicated(keep="first").sum()

0

In [75]:
# missing grades are replaced with the placeholder string "None"
df["grade"] = df["grade"].fillna(value='None')
# sanity check: the grade column should have no missing values left
df["grade"].isna().sum()

0

In [76]:
# missing review bodies are replaced with the placeholder string "No Review"
df["review_text"] = df["review_text"].fillna(value='No Review')
# sanity check: the review_text column should have no missing values left
df["review_text"].isna().sum()

0

In [96]:
# cleaning up grade column by standardizing letter grades

# The raw values look like "Expecting an A-" / "Expecting a B+". Strip that
# prefix first: the letter patterns below are unanchored, so without this step
# they would also rewrite the "a" in "an" and the "c" in "Expecting".
# This is a no-op when the prefix has already been removed.
df['grade'] = df['grade'].str.replace(r'^Expecting an? ', '', regex=True)

grades_to_standardize = ['A', 'B', 'C', 'D', 'F']

for grade in grades_to_standardize:
    # Upper- or lower-case letter with an optional +/- on either side and an
    # optional trailing "?" collapses to the bare upper-case letter.
    # Raw f-string so that "\?" reaches the regex engine as written instead of
    # being treated as an (invalid) Python string escape.
    pattern = rf'[+-]?[{grade}{grade.lower()}][+-]?\??'
    df['grade'] = df['grade'].str.replace(pattern, grade, regex=True)

df["grade"].value_counts()

A       15804
B        4630
None     4135
C        1145
D         136
W         113
P          80
F          71
?           8
XF          6
BC          3
??          2
95          1
85          1
CB          1
CD          1
AA          1
74          1
-           1
I           1
s           1
65          1
AU          1
Name: grade, dtype: int64

In [99]:
# handling individual cases

# Leftover one-off values are mapped by exact match on the whole cell value,
# so a rule such as "-" or "s" can never rewrite part of a longer value.
grade_overrides = {
    # numeric scores and two-letter entries folded into a single letter
    '95': "A", 'AA': "A", 'AU': "A",
    '85': "B", 'BC': "B",
    '74': "C", 'CB': "C", 'CD': "C",
    '65': "D",
    # NOTE(review): XF and W are both counted as F here - confirm W should
    # not be kept as its own category
    'XF': "F", 'W': "F",
    # values that carry no usable grade become the "None" placeholder
    'I': "None", 's': "None", '-': "None",
}
df['grade'] = df['grade'].replace(grade_overrides)

# a value consisting only of question marks ("?", "??", ...) is also "None";
# raw string so "\?" is passed to the regex engine unchanged
df['grade'] = df['grade'].str.replace(r'^\?+$', "None", regex=True)

df["grade"].value_counts()

A       15807
B        4634
None     4148
C        1148
F         190
D         137
P          80
Name: grade, dtype: int64