Importing the Sight Dataset and the Coursera Review Dataset and Preprocessing/Cleaning them

In [5]:
import pandas as pd

sight_df = pd.read_csv(
    "data/sight_dataset.csv",
    delimiter=",",               # Specify delimiter
    quotechar='"',               # Handle embedded quotes
    escapechar="\\",             # Escape special characters
    on_bad_lines="skip",         # Skip problematic lines
    engine="python"              # Use the Python parser for flexibility
)

# Upon analysis, we discovered that this dataset can be used to test since it is unlabelled.
#sight_df.head()

coursera_df = pd.read_csv("data/reviews_by_course.csv")
coursera_df.head()

Unnamed: 0,CourseId,Review,Label
0,2-speed-it,BOring,1
1,2-speed-it,Bravo !,5
2,2-speed-it,Very goo,5
3,2-speed-it,"Great course - I recommend it for all, especia...",5
4,2-speed-it,One of the most useful course on IT Management!,5


In [7]:
#Inspecting the data types and checking if there are any missing values
print(coursera_df.info())
print(coursera_df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140320 entries, 0 to 140319
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   CourseId  140320 non-null  object
 1   Review    140317 non-null  object
 2   Label     140320 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 3.2+ MB
None
CourseId    0
Review      3
Label       0
dtype: int64


In [9]:
# Since only 3 missing values, dropping them
coursera_df = coursera_df.dropna(subset=["Review"])
print(coursera_df.isnull().sum())

CourseId    0
Review      0
Label       0
dtype: int64


In [11]:
#Checking label distribution
print(coursera_df["Label"].value_counts())

Label
5    106514
4     22460
3      5923
1      2866
2      2554
Name: count, dtype: int64


In [13]:
'''At this point, given that the dataset is skewed with a lot more rows labelled 5 than not, given the choice of undersampling label 5, oversampling the lower labels 1-4,
and using class weights, we decided to go with class weights to avoid artificially adding data (fake reviews) to the dataset. 
'''

from sklearn.utils.class_weight import compute_class_weight
import numpy as np

#Defining the classes and their freq

classes = sorted(coursera_df["Label"].unique())
print(classes)
class_weights = compute_class_weight(
    class_weight = "balanced",
    classes = classes,
    y=coursera_df["Label"]
)

#Converting to dictionary for easy ref
class_weights_dict = {classes[i]: class_weights[i] for i in range(len(classes))}
print(class_weights_dict)


[1, 2, 3, 4, 5]
{1: 9.791835310537333, 2: 10.988018794048552, 3: 4.738038156339693, 4: 1.2494835262689226, 5: 0.26347146853934694}


Upon visual inspection of the dataset, we noticed some issues:

1. Non-English Reviews: Some of the reviews are in spanish. In BERT is pre-trained on English text, this will cause problems and affect fine-tuning quality.
2. Gibberish and Encoding issues: We found that some of the reviews had plain gibberish text. eg: Ð”Ð¾ÑÑ‚ÑƒÐ¿Ð½Ð¾ Ð¸ Ð¸Ð½Ñ‚ÐµÑ€ÐµÑÐ½Ð¾. We will either correct the encoding errors or drop them.

In [14]:
# install langdetect and unidecode

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
def detect_language(text):
    try: 
        return detect(text)
    except LangDetectException:
        return "unknown"
    
#Detecting each review language
coursera_df["Language"] = coursera_df["Review"].apply(detect_language)

#filtering only English reviews
english_reviews_df = coursera_df[coursera_df["Language"]=="en"]

print(english_reviews_df["Language"].value_counts())
english_reviews_df.head()

Language
en    106847
Name: count, dtype: int64


Unnamed: 0,CourseId,Review,Label,Language
3,2-speed-it,"Great course - I recommend it for all, especia...",5,en
4,2-speed-it,One of the most useful course on IT Management!,5,en
5,2-speed-it,I was disappointed because the name is mislead...,3,en
6,2-speed-it,Super content. I'll definitely re-do the course,5,en
8,2-speed-it,One of the excellent courses at Coursera for i...,5,en


In [19]:
#Fixing giberrish

from unidecode import unidecode

english_reviews_df["cleaned_review"] = english_reviews_df["Review"].apply(unidecode)

print(english_reviews_df["cleaned_review"].head())
print(english_reviews_df.shape) 


3    Great course - I recommend it for all, especia...
4      One of the most useful course on IT Management!
5    I was disappointed because the name is mislead...
6      Super content. I'll definitely re-do the course
8    One of the excellent courses at Coursera for i...
Name: cleaned_review, dtype: object
(106847, 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  english_reviews_df["cleaned_review"] = english_reviews_df["Review"].apply(unidecode)


In [17]:
#Checking whether original df had more langs

print(coursera_df["Language"].value_counts())



Language
en         106847
es          12270
fr           3639
zh-cn        2335
pt           2239
ro           2109
ca           1982
ru           1534
af           1532
so            976
de            767
it            538
no            412
tl            333
ko            329
cy            258
sw            256
id            225
unknown       206
nl            192
vi            179
pl            173
sl            152
da            136
cs             83
et             80
sk             79
sq             71
tr             52
sv             51
zh-tw          48
fi             41
hr             36
hu             32
lt             30
bg             30
mk             14
uk             13
he             11
ar             10
lv              9
fa              4
ja              2
ur              1
el              1
Name: count, dtype: int64


In [20]:
# Saving new csv
english_reviews_df.to_csv("data/coursera_english_reviews.csv", index=False)


Now that we only have English reviews, time to apply standard text preprocessing such as conversion to lowercase, punctuation removal, special character removal, and extra whitespace removal

In [21]:
import re

# Text cleaning function
def clean_text(text):
    text = text.lower()  
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Applying cleaning to the Review column
english_reviews_df["cleaned_review"] = english_reviews_df["Review"].apply(clean_text)

# Verifying the changes
print(english_reviews_df[["Review", "cleaned_review"]].head())

                                              Review  \
3  Great course - I recommend it for all, especia...   
4    One of the most useful course on IT Management!   
5  I was disappointed because the name is mislead...   
6    Super content. I'll definitely re-do the course   
8  One of the excellent courses at Coursera for i...   

                                      cleaned_review  
3  great course i recommend it for all especially...  
4     one of the most useful course on it management  
5  i was disappointed because the name is mislead...  
6       super content ill definitely redo the course  
8  one of the excellent courses at coursera for i...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  english_reviews_df["cleaned_review"] = english_reviews_df["Review"].apply(clean_text)


In [22]:
# Spliting the data
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(english_reviews_df, test_size=0.2, stratify=english_reviews_df["Label"], random_state=37)

print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")

Training set size: 85477
Validation set size: 21370


Tokenizing using BERT from Hugging Face Transformers library

In [25]:
# Analyzing dataset to determine max_length for BERT tokenization
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

review_lengths = english_reviews_df["cleaned_review"].apply(lambda x: len(tokenizer.tokenize(x)))
print(review_lengths.describe())  # Check mean, median, and max token length


count    106847.000000
mean         26.535307
std          34.915780
min           1.000000
25%           8.000000
50%          16.000000
75%          33.000000
max        1425.000000
Name: cleaned_review, dtype: float64


In [27]:
print(english_reviews_df.columns)

Index(['CourseId', 'Review', 'Label', 'Language', 'cleaned_review'], dtype='object')


In [30]:
#Verifying GPU usage
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU:", torch.cuda.get_device_name(0))


PyTorch version: 2.0.1+cu117
CUDA available: True
CUDA version: 11.7
GPU: NVIDIA GeForce RTX 4060 Laptop GPU


In [35]:
from transformers import BertTokenizer

# Load a tokenizer to test
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print("Transformers library is functional!")


Transformers library is functional!


In [36]:
# Given that token lenght rarely even touches 128, we will keep max_lenght as 128

def tokenize_data(data):
    return tokenizer(
        list(data["cleaned_review"]),
        padding = True,
        truncation = True,
        max_length = 128,
        return_tensors="pt"
    )

train_encodings = tokenize_data(train_data)
val_encodings = tokenize_data(val_data)

print("Tokenization has been completed!")


ImportError: Unable to convert output to PyTorch tensors format, PyTorch is not installed.