In [1]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

In [2]:
df_list = pd.read_csv('imdb_list.csv')
df_reviews = pd.read_csv('imdb_reviews.csv')

In [3]:
df_list.head()

Unnamed: 0.1,Unnamed: 0,id,title,rating,genre,year
0,0,tt0369610,Jurassic World,6.9,"Action, Adventure, Sci-Fi",2015
1,1,tt3774694,Love,6.1,"Drama, Romance",2015
2,2,tt2361509,The Intern,7.1,"Comedy, Drama",2015
3,3,tt2381249,Mission: Impossible - Rogue Nation,7.4,"Action, Adventure, Thriller",2015
4,4,tt3460252,The Hateful Eight,7.8,"Crime, Drama, Mystery",2015


In [4]:
df_reviews.head()

Unnamed: 0,imdb_id,review title,review_rating,review
0,tt0369610,"Spielberg Magic, This Is Not. Still, a Visit t...",7.0,You may have heard some critics champion Juras...
1,tt0369610,"Not a patch on the original Jurassic Park, and...",7.0,The original Jurassic Park still is a personal...
2,tt0369610,Its a Jurassic World after all.,7.0,The 4th film in the Jurassic Park series and t...
3,tt0369610,Worthy Sequel To One Of The Greatest Films Eve...,,Let's start this by stating how much of a die ...
4,tt0369610,Manages to somewhat return the Jurassic Park s...,6.0,Modernized and polished entry to the Jurassic ...


In [5]:
df_reviews.rename(columns={'imdb_id': 'id'}, inplace=True)

In [6]:
merged_df = pd.merge(df_list, df_reviews, on='id')
merged_df.head()

Unnamed: 0.1,Unnamed: 0,id,title,rating,genre,year,review title,review_rating,review
0,0,tt0369610,Jurassic World,6.9,"Action, Adventure, Sci-Fi",2015,"Spielberg Magic, This Is Not. Still, a Visit t...",7.0,You may have heard some critics champion Juras...
1,0,tt0369610,Jurassic World,6.9,"Action, Adventure, Sci-Fi",2015,"Not a patch on the original Jurassic Park, and...",7.0,The original Jurassic Park still is a personal...
2,0,tt0369610,Jurassic World,6.9,"Action, Adventure, Sci-Fi",2015,Its a Jurassic World after all.,7.0,The 4th film in the Jurassic Park series and t...
3,0,tt0369610,Jurassic World,6.9,"Action, Adventure, Sci-Fi",2015,Worthy Sequel To One Of The Greatest Films Eve...,,Let's start this by stating how much of a die ...
4,0,tt0369610,Jurassic World,6.9,"Action, Adventure, Sci-Fi",2015,Manages to somewhat return the Jurassic Park s...,6.0,Modernized and polished entry to the Jurassic ...


In [7]:
merged_df.to_csv('merged_movie_data.csv', index=False)

In [8]:
df = pd.read_csv('merged_movie_data.csv')
df = df.dropna(subset=['review'])
df['sentiment'] = df['rating'].apply(lambda x: 1 if x >= 7 else 0)
X = df['review']
y = df['sentiment']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [11]:
param_grid = {
    'C': [0.1, 1, 10, 100],  
    'solver': ['liblinear', 'lbfgs', 'newton-cg'] 
}

grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5, 
    scoring='accuracy',
    n_jobs=-1, 
    verbose=1 
)

In [12]:
grid_search.fit(X_train_vec, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [13]:
print(f"The bes parameters: {grid_search.best_params_}")
print(f"the vest valid precision: {grid_search.best_score_:.2f}")

The bes parameters: {'C': 10, 'solver': 'liblinear'}
the vest valid precision: 0.84


In [16]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"test accuracy of the best model: {test_accuracy:.2f}")

test accuracy of the best model: 0.84


In [17]:
joblib.dump(best_model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']