In [5]:
#Group 11
#Ariba S., Sreya R. and Melaany A.
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# download necessary NLTK packages
nltk.download('punkt')
nltk.download('wordnet')

# read the dataset
with open('musical1.tsv', 'r') as file:
    dataset = file.read()

# separate the reviews and scores
reviews = []
scores = []
for line in dataset.split('\n')[1:]:
    if line:
        review, score = line.split('\t')
        reviews.append(review.lower())
        scores.append(int(score))

# 1.tokenize the reviews
tokenized_reviews = [word_tokenize(review) for review in reviews]

# 2.perform stemming for the tokens of the reviews
porter = PorterStemmer()
stemmed_reviews = []
for review in tokenized_reviews:
    stemmed_review = [porter.stem(token) for token in review]
    stemmed_reviews.append(stemmed_review)

# 3.perform lemmatization for the stemmed tokens
lemmatizer = WordNetLemmatizer()
lemmatized_reviews = []
for review in stemmed_reviews:
    lemmatized_review = [lemmatizer.lemmatize(token) for token in review]
    lemmatized_reviews.append(lemmatized_review)

# convert the list of reviews to string
corpus = [' '.join(review) for review in lemmatized_reviews]

# extract features from the corpus using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus).toarray()

# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, scores, test_size=0.2, random_state=42)

# 4. build the Random Forest classifier
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

# predict the scores for the test set
y_pred = rfc.predict(X_test)

# 5.evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {round(accuracy,3)}")
print(f"Precision: {round(precision,3)}")
print(f"Recall: {round(recall,3)}")
print(f"F1 Score: {round(f1,3)}")


[nltk_data] Downloading package punkt to /Users/sreyaroy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sreyaroy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.775
Precision: 0.785
Recall: 0.833
F1 Score: 0.809
