In [1]:
import pandas as pd 
import numpy as np 


In [20]:
def read_tsv(file_path):
    return pd.read_csv(file_path, sep='\t', encoding='latin1')

data = read_tsv('../Data/rt.reviews.tsv')
data.head()


Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


In [21]:
# Start data cleaning process
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54432 entries, 0 to 54431
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          54432 non-null  int64 
 1   review      48869 non-null  object
 2   rating      40915 non-null  object
 3   fresh       54432 non-null  object
 4   critic      51710 non-null  object
 5   top_critic  54432 non-null  int64 
 6   publisher   54123 non-null  object
 7   date        54432 non-null  object
dtypes: int64(2), object(6)
memory usage: 3.3+ MB


In [22]:
# check if there are any missing values
data.isnull().sum()



id                0
review         5563
rating        13517
fresh             0
critic         2722
top_critic        0
publisher       309
date              0
dtype: int64

In [23]:
# if there are missing values, we can drop them
data = data.dropna()


In [24]:
# check again for missing values
data.isnull().sum()

id            0
review        0
rating        0
fresh         0
critic        0
top_critic    0
publisher     0
date          0
dtype: int64

In [25]:
# Check for any duplicates
data.duplicated().sum()

np.int64(0)

In [None]:
# Other ways for data cleaning are like removing special characters, converting text to lowercase, removing stop words, and stemming or lemmatization.
# So in our tsv file we have reviews column which contains text data. We will perform text cleaning on that column.
import re # this is for regular expressions meaning removing special characters
from nltk.stem import PorterStemmer # this is for stemming 

# Initialize the stemmer
stemmer = PorterStemmer()

# Define a function for text cleaning
def clean_text(text):
    # Remove special characters
    text = re.sub(r'\W', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Perform stemming ( Stemming is the process of reducing a word to its root form example: running -> run )
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text

# Apply the cleaning function to the reviews column
data['review'] = data['review'].apply(clean_text)

In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33988 entries, 0 to 54424
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          33988 non-null  int64 
 1   review      33988 non-null  object
 2   rating      33988 non-null  object
 3   fresh       33988 non-null  object
 4   critic      33988 non-null  object
 5   top_critic  33988 non-null  int64 
 6   publisher   33988 non-null  object
 7   date        33988 non-null  object
dtypes: int64(2), object(6)
memory usage: 2.3+ MB


In [34]:
# Step 1: Keep only valid letter grades
letter_grades = ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D", "F"]
data['rating_clean'] = data['rating'].apply(lambda x: x if x in letter_grades else np.nan)

# Step 2: Map letter grades to descriptive labels
grade_mapping = {
    'A+': 'Excellent', 'A': 'Excellent', 'A-': 'Excellent',
    'B+': 'Above Average', 'B': 'Above Average', 'B-': 'Above Average',
    'C+': 'Average', 'C': 'Average', 'C-': 'Average',
    'D': 'Below Average',
    'F': 'Fail'
}

data['rating_clean'] = data['rating_clean'].map(grade_mapping)

# Step 3: Optionally, drop rows with NaN (non-letter grades)
data = data.dropna(subset=['rating_clean'])

data.head()


Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date,rating_clean
6,3,quickli grow repetit and tiresom meander towar...,C,rotten,Eric D. Snider,0,EricDSnider.com,"July 17, 2013",Average
11,3,while not one of cronenberg s stronger film th...,B-,fresh,Emanuel Levy,0,EmanuelLevy.Com,"February 3, 2013",Above Average
13,3,the anger over the injustic of the financi col...,B,fresh,Robert Roten,0,Laramie Movie Scope,"January 7, 2013",Above Average
17,3,it major problem is that it s not cinemat,B,fresh,Dennis Schwartz,0,Ozus' World Movie Reviews,"September 25, 2012",Above Average
34,3,i don t know if delillo s fetish doubletalk ev...,C,rotten,Vincent Mancini,0,FilmDrunk,"August 30, 2012",Average


In [32]:
data.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date,rating_clean
6,3,quickli grow repetit and tiresom meander towar...,C,rotten,Eric D. Snider,0,EricDSnider.com,"July 17, 2013",C
11,3,while not one of cronenberg s stronger film th...,B-,fresh,Emanuel Levy,0,EmanuelLevy.Com,"February 3, 2013",B-
13,3,the anger over the injustic of the financi col...,B,fresh,Robert Roten,0,Laramie Movie Scope,"January 7, 2013",B
17,3,it major problem is that it s not cinemat,B,fresh,Dennis Schwartz,0,Ozus' World Movie Reviews,"September 25, 2012",B
34,3,i don t know if delillo s fetish doubletalk ev...,C,rotten,Vincent Mancini,0,FilmDrunk,"August 30, 2012",C


In [35]:
# Making a new cleaned data file
data.to_csv('../Data/cleaned_rt_reviews.csv', index=False)
data.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date,rating_clean
6,3,quickli grow repetit and tiresom meander towar...,C,rotten,Eric D. Snider,0,EricDSnider.com,"July 17, 2013",Average
11,3,while not one of cronenberg s stronger film th...,B-,fresh,Emanuel Levy,0,EmanuelLevy.Com,"February 3, 2013",Above Average
13,3,the anger over the injustic of the financi col...,B,fresh,Robert Roten,0,Laramie Movie Scope,"January 7, 2013",Above Average
17,3,it major problem is that it s not cinemat,B,fresh,Dennis Schwartz,0,Ozus' World Movie Reviews,"September 25, 2012",Above Average
34,3,i don t know if delillo s fetish doubletalk ev...,C,rotten,Vincent Mancini,0,FilmDrunk,"August 30, 2012",Average
