In [3]:
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
import sklearn.linear_model
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_csv('imdb-movies-dataset.csv')
df.head()

Unnamed: 0,Poster,Title,Year,Certificate,Duration (min),Genre,Rating,Metascore,Director,Cast,Votes,Description,Review Count,Review Title,Review
0,https://m.media-amazon.com/images/M/MV5BYWRkZj...,The Idea of You,2023.0,R,115.0,"Comedy, Drama, Romance",6.4,67.0,Michael Showalter,"Anne Hathaway, Nicholas Galitzine, Ella Rubin,...",28744,"Solène, a 40-year-old single mom, begins an un...",166,Hypocrisy as an idea,"This film, as well as the reaction to it, is a..."
1,https://m.media-amazon.com/images/M/MV5BZGI4NT...,Kingdom of the Planet of the Apes,2023.0,PG-13,145.0,"Action, Adventure, Sci-Fi",7.3,66.0,Wes Ball,"Owen Teague, Freya Allan, Kevin Durand, Peter ...",22248,"Many years after the reign of Caesar, a young ...",183,A phenomenal start to another trilogy!,"I'm a big fan of all the planet of the apes, a..."
2,https://m.media-amazon.com/images/M/MV5BZjIyOT...,Unfrosted,2023.0,PG-13,97.0,"Biography, Comedy, History",5.5,42.0,Jerry Seinfeld,"Isaac Bae, Jerry Seinfeld, Chris Rickett, Rach...",18401,"In 1963 Michigan, business rivals Kellogg's an...",333,not funny,Pretty much the worst criticism you can lay on...
3,https://m.media-amazon.com/images/M/MV5BMjA5Zj...,The Fall Guy,2023.0,PG-13,126.0,"Action, Comedy, Drama",7.3,73.0,David Leitch,"Ryan Gosling, Emily Blunt, Aaron Taylor-Johnso...",38953,A down-and-out stuntman must find the missing ...,384,Everything you needed and more!,Just got out of the Austin premier at SXSW and...
4,https://m.media-amazon.com/images/M/MV5BNTk1MT...,Challengers,2023.0,R,131.0,"Drama, Romance, Sport",7.7,82.0,Luca Guadagnino,"Zendaya, Mike Faist, Josh O'Connor, Darnell Ap...",32517,"Tashi, a former tennis prodigy turned coach, t...",194,"Watch ""Match Point"" instead",This is a tough one. I liked the concept and t...


In [5]:
df['Rating'].unique

<bound method Series.unique of 0       6.4
1       7.3
2       5.5
3       7.3
4       7.7
       ... 
9995    6.5
9996    7.5
9997    6.5
9998    6.4
9999    7.5
Name: Rating, Length: 10000, dtype: float64>

In [6]:
drop_cols = ['Poster', 'Genre', 'Director', 'Cast', 'Year', 'Metascore', 'Votes', 'Description']

In [7]:
df = df.drop(columns=drop_cols)
df

Unnamed: 0,Title,Certificate,Duration (min),Rating,Review Count,Review Title,Review
0,The Idea of You,R,115.0,6.4,166,Hypocrisy as an idea,"This film, as well as the reaction to it, is a..."
1,Kingdom of the Planet of the Apes,PG-13,145.0,7.3,183,A phenomenal start to another trilogy!,"I'm a big fan of all the planet of the apes, a..."
2,Unfrosted,PG-13,97.0,5.5,333,not funny,Pretty much the worst criticism you can lay on...
3,The Fall Guy,PG-13,126.0,7.3,384,Everything you needed and more!,Just got out of the Austin premier at SXSW and...
4,Challengers,R,131.0,7.7,194,"Watch ""Match Point"" instead",This is a tough one. I liked the concept and t...
...,...,...,...,...,...,...,...
9995,The Greatest Show on Earth,U,152.0,6.5,128,"Hey, doesn't anyone remember Last Emperor?",It constantly amazes me that people carp that ...
9996,Berserk: Ougon Jidai-hen I - Haou no Tamago,,76.0,7.5,12,Masterfully directed climatic epic saga,Few stories can capture your mind and soul in ...
9997,Is-slottet,,78.0,6.5,4,Beautiful Film,"This film might not be to everyone's taste, it..."
9998,Loving Pablo,A,123.0,6.4,84,That film should be in Spanish,Why anyone (the director?) made Spanish actors...


In [8]:
df = df.dropna()

In [9]:
def sentiment_analysis(rating):
    if rating < 2.5:
        return -1 + (rating / 2.5)  
    elif rating < 5:
        return -0.5 + ((rating - 2.5) / 2.5) * 0.5 
    elif rating < 7.5:
        return (rating - 5) / 2.5 * 0.5 
    else:
        return 0.5 + ((rating - 7.5) / 2.5) * 0.5 

df['Sentiment'] = df['Rating'].apply(sentiment_analysis)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment'] = df['Rating'].apply(sentiment_analysis)


Unnamed: 0,Title,Certificate,Duration (min),Rating,Review Count,Review Title,Review,Sentiment
0,The Idea of You,R,115.0,6.4,166,Hypocrisy as an idea,"This film, as well as the reaction to it, is a...",0.28
1,Kingdom of the Planet of the Apes,PG-13,145.0,7.3,183,A phenomenal start to another trilogy!,"I'm a big fan of all the planet of the apes, a...",0.46
2,Unfrosted,PG-13,97.0,5.5,333,not funny,Pretty much the worst criticism you can lay on...,0.10
3,The Fall Guy,PG-13,126.0,7.3,384,Everything you needed and more!,Just got out of the Austin premier at SXSW and...,0.46
4,Challengers,R,131.0,7.7,194,"Watch ""Match Point"" instead",This is a tough one. I liked the concept and t...,0.54
...,...,...,...,...,...,...,...,...
9990,The Masque of the Red Death,A,89.0,6.9,109,Price at his Most Wicked!,Roger Corman has done an outstanding job with ...,0.38
9993,South Central,R,98.0,6.8,20,"""South Central was adapted from my novel ""Sout...",I would like to thank all of the fans and supp...,0.36
9994,Mutiny on the Bounty,U,132.0,7.6,112,"Our Favorite ""Mutiny"": April 28, 1789",Although the versions with Marlon Brando and T...,0.52
9995,The Greatest Show on Earth,U,152.0,6.5,128,"Hey, doesn't anyone remember Last Emperor?",It constantly amazes me that people carp that ...,0.30


In [10]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

In [11]:
X = tfidf.fit_transform(df['Review'])
y = df['Sentiment']

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state= 42)


In [17]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)
pred_gbr = model.predict(X_test)
mse = mean_squared_error(y_test, pred_gbr)
r2 = r2_score(y_test, pred_gbr)

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')

In [None]:
svr = SVR(kernel = 'rbf', C = 1.0, epsilon=0.2)
svr.fit(X_train, y_train)
pred_svr - svr.predict(X_test)
mse = mean_squared_error(y_test, pred_svr)
r2 = r2_score(y_test, pred_svr)

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')

Mean Squared Error: 0.029205120800784228
R² Score: 0.22090178921855685


In [16]:
for actual, predicted in zip(y_test, pred_gbr):
    print(f"Actual: {actual}, Predicted (Gradient Boost): {predicted}")

Actual: -0.22000000000000003, Predicted: 0.19704101962637618
Actual: -0.09999999999999998, Predicted: 0.20832623069211562
Actual: 0.3, Predicted: 0.24076187781219707
Actual: -0.11999999999999994, Predicted: 0.18928203293828313
Actual: 0.5199999999999999, Predicted: 0.33225964305952876
Actual: 0.38000000000000006, Predicted: 0.3197195141124418
Actual: 0.25999999999999995, Predicted: 0.26651357254898717
Actual: 0.1, Predicted: 0.2792164995098009
Actual: 0.45999999999999996, Predicted: 0.35402619083060444
Actual: 0.41999999999999993, Predicted: 0.29036521229560475
Actual: 0.2800000000000001, Predicted: 0.3297744314513173
Actual: 0.54, Predicted: 0.29896465827849533
Actual: 0.36, Predicted: 0.2650691028848413
Actual: 0.15999999999999998, Predicted: 0.18509516556661226
Actual: 0.3, Predicted: 0.27696331193436735
Actual: 0.41999999999999993, Predicted: 0.3412137929270993
Actual: 0.38000000000000006, Predicted: 0.41227232869690383
Actual: 0.3, Predicted: 0.3363646360333023
Actual: 0.480000000

In [None]:
for actual, predicted in zip(y_test, pred_):
    print(f"Actual: {actual}, Predicted (Gradient Boost): {predicted}")