In [None]:
pip install pandas numpy scikit-learn requests sqlalchemy surprise




In [None]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

In [None]:
from google.colab import drive
drive.mount('/content/drive') # Mount to the existing '/content/drive' directory



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load IMDb datasets
try:
    basics = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/title.basics.tsv.gz', sep='\t', na_values='\\N')
    ratings = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/title.ratings.tsv.gz', sep='\t', na_values='\\N')

    # Merge datasets on 'tconst'
    imdb_df = pd.merge(basics, ratings, on='tconst')
    imdb_df = imdb_df[['tconst', 'primaryTitle', 'startYear', 'runtimeMinutes', 'genres', 'averageRating', 'numVotes']]

    # Ensure 'tconst' is string type in imdb_df
    imdb_df['tconst'] = imdb_df['tconst'].astype(str)

    print("IMDb datasets loaded and merged successfully.")
    print(imdb_df.head())  # Display the first few rows of the merged dataset
except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Please ensure the file paths are correct and the files exist in the specified location.")

  basics = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/title.basics.tsv.gz', sep='\t', na_values='\\N')


IMDb datasets loaded and merged successfully.
      tconst            primaryTitle  startYear runtimeMinutes  \
0  tt0000001              Carmencita     1894.0            1.0   
1  tt0000002  Le clown et ses chiens     1892.0            5.0   
2  tt0000003          Pauvre Pierrot     1892.0            5.0   
3  tt0000004             Un bon bock     1892.0           12.0   
4  tt0000005        Blacksmith Scene     1893.0            1.0   

                     genres  averageRating  numVotes  
0         Documentary,Short            5.7      2062  
1           Animation,Short            5.6       279  
2  Animation,Comedy,Romance            6.5      2030  
3           Animation,Short            5.4       180  
4              Comedy,Short            6.2      2797  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_df['tconst'] = imdb_df['tconst'].astype(str)


In [None]:
API_KEY = "6c22fc92dcd1b4859b2cc1d5d4105594"
BASE_URL = "https://api.themoviedb.org/3"

def fetch_tmdb_data(movie_id):
    endpoint = f"{BASE_URL}/movie/{movie_id}"
    params = {"api_key": API_KEY}
    response = requests.get(endpoint, params=params)
    return response.json()

# Fetch TMDb data for a subset of movies (limit to 1000 for demonstration)
tmdb_data = []
for tconst in imdb_df['tconst'][:1000]:
    try:
        tmdb_movie = fetch_tmdb_data(tconst)
        if 'id' in tmdb_movie:
            tmdb_movie['id'] = str(tmdb_movie['id'])  # Convert 'id' to string
            tmdb_data.append(tmdb_movie)
    except:
        continue

tmdb_df = pd.DataFrame(tmdb_data)
print("TMDb datasets loaded successfully.")
print(tmdb_df.head())  # Display the first few rows of the TMDb dataset

TMDb datasets loaded successfully.
   adult                     backdrop_path belongs_to_collection  budget  \
0  False  /pm9G1rfVvtBP32hNy4EwDEJpVtL.jpg                  None       0   
1  False  /pfeS2kjgvrjsCaKQ2NCFR22b9aR.jpg                  None       0   
2  False  /1V0506KOUCyegcXmndaWjnGl4wF.jpg                  None       0   
3  False                              None                  None       0   
4  False  /mDD99APoTgMuNJrkmAfGicooJHa.jpg                  None       0   

                                              genres  \
0                [{'id': 99, 'name': 'Documentary'}]   
1                  [{'id': 16, 'name': 'Animation'}]   
2  [{'id': 35, 'name': 'Comedy'}, {'id': 16, 'nam...   
3                  [{'id': 16, 'name': 'Animation'}]   
4  [{'id': 18, 'name': 'Drama'}, {'id': 99, 'name...   

                                            homepage     id    imdb_id  \
0                                                     16612  tt0000001   
1  https://www.emilerey

In [None]:
merged_df = pd.concat([imdb_df, tmdb_df], axis=1, join='inner')

In [None]:
# Feature engineering
merged_df['release_year'] = pd.to_datetime(merged_df['release_date']).dt.year
merged_df['budget_per_minute'] = merged_df['budget'] / merged_df['runtimeMinutes']
merged_df['revenue_per_minute'] = merged_df['revenue'] / merged_df['runtimeMinutes']
merged_df['profit'] = merged_df['revenue'] - merged_df['budget']
merged_df['roi'] = merged_df['profit'] / merged_df['budget']

In [None]:
# Select features for the model
features = ['startYear', 'runtimeMinutes', 'numVotes', 'budget', 'revenue', 'popularity', 'vote_count', 'vote_average', 'release_year', 'budget_per_minute', 'revenue_per_minute', 'profit', 'roi']
X = merged_df[features]
y = (merged_df['averageRating'] > 7).astype(int)  # Binary classification: 1 if rating > 7, 0 otherwise

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Preprocessing
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numeric_features),
    ])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier # Make sure to import the necessary modules
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier # Import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
# Define models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    """'Naive Bayes': GaussianNB(),"""
    'XGBoost': xgb.XGBClassifier(eval_metric='logloss', random_state=42)
}


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
results = {}

for name, model in models.items():
    try:
        print(f"Training {name}...")
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        results[name] = {}
        results[name]['Accuracy'] = accuracy_score(y_test, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted', zero_division=0)
        results[name]['Precision'] = precision
        results[name]['Recall'] = recall
        results[name]['F1-score'] = f1
        print(f"Finished evaluating {name}")
    except Exception as e:
        print(f"Error occurred while training/evaluating {name}: {str(e)}")

print("All models trained and evaluated")

Training Random Forest...
Finished evaluating Random Forest
Training Gradient Boosting...
Finished evaluating Gradient Boosting
Training SVM...
Finished evaluating SVM
Training KNN...
Finished evaluating KNN
Training 'Naive Bayes': GaussianNB(),XGBoost...
Finished evaluating 'Naive Bayes': GaussianNB(),XGBoost
All models trained and evaluated


In [None]:
from tabulate import tabulate
# Function to print results
def print_results(results):
    print("\nModel Performance Results:")

    df_results = pd.DataFrame(results).T
    df_results = df_results.round(4)
    df_results = df_results.sort_values('F1-score', ascending=False)

    print(tabulate(df_results, headers='keys', tablefmt='pretty'))

    print("\nDetailed Model Results:")
    for model_name, metrics in results.items():
        print(f"\n{model_name}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.4f}")

# Print results
print_results(results)


Model Performance Results:
+-------------------------------------+----------+-----------+--------+----------+
|                                     | Accuracy | Precision | Recall | F1-score |
+-------------------------------------+----------+-----------+--------+----------+
| 'Naive Bayes': GaussianNB(),XGBoost |  0.9739  |  0.9745   | 0.9739 |  0.9653  |
|            Random Forest            |  0.9673  |  0.9357   | 0.9673 |  0.9513  |
|                 SVM                 |  0.9673  |  0.9357   | 0.9673 |  0.9513  |
|                 KNN                 |  0.9673  |  0.9357   | 0.9673 |  0.9513  |
|          Gradient Boosting          |  0.9477  |  0.9351   | 0.9477 |  0.9414  |
+-------------------------------------+----------+-----------+--------+----------+

Detailed Model Results:

Random Forest:
  Accuracy: 0.9673
  Precision: 0.9357
  Recall: 0.9673
  F1-score: 0.9513

Gradient Boosting:
  Accuracy: 0.9477
  Precision: 0.9351
  Recall: 0.9477
  F1-score: 0.9414

SVM:
  Accura

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.compose import ColumnTransformer

# Create a pipeline with an imputer and HistGradientBoostingClassifier
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
tree_pipeline = Pipeline([
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', Pipeline([
                ('imputer', SimpleImputer(strategy='mean')),
            ]), numeric_features),
        ],
        remainder='passthrough' # Passthrough string columns to the classifier
    )),
    ('tree', HistGradientBoostingClassifier(random_state=42))
])

# Bagging
bagging = BaggingClassifier(estimator=tree_pipeline, n_estimators=10, random_state=42)

# Convert X_train to a DataFrame with appropriate column names (if it's not already)
X_train_df = pd.DataFrame(X_train, columns=X.columns)

bagging.fit(X_train_df, y_train)
y_pred_bagging = bagging.predict(X_test)

print("\nBagging Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_bagging):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_bagging, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_bagging, average='weighted'):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_bagging, average='weighted'):.4f}")

# Add results to the results dictionary
results['Bagging'] = {
    'Accuracy': accuracy_score(y_test, y_pred_bagging),
    'Precision': precision_score(y_test, y_pred_bagging, average='weighted'),
    'Recall': recall_score(y_test, y_pred_bagging, average='weighted'),
    'F1-score': f1_score(y_test, y_pred_bagging, average='weighted')
}

ValueError: Specifying the columns using strings is only supported for dataframes.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to convert mixed-type columns to strings
def convert_to_string(X):
    for column in X.columns:
        if X[column].dtype == 'object' or X[column].apply(type).nunique() > 1:
            X[column] = X[column].astype(str)
    return X

# Function to handle missing values
def handle_missing(X):
    X = X.replace('missing', np.nan)
    numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns
    X[numeric_columns] = X[numeric_columns].fillna(X[numeric_columns].mean())
    categorical_columns = X.select_dtypes(include=['object', 'category']).columns
    X[categorical_columns] = X[categorical_columns].fillna('Unknown')
    return X

# Assuming X and y are your features and target variables
# If not, replace X and y with your actual data

# Handle missing values and convert mixed-type columns to strings
X = handle_missing(X)
X = convert_to_string(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numeric and categorical columns
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object', 'category', 'string']).columns

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Create a pipeline that includes preprocessing and AdaBoost
boosting_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=1),
        n_estimators=100,
        random_state=42,
        algorithm='SAMME'
    ))
])

# Fit the pipeline
boosting_pipeline.fit(X_train, y_train)

# Make predictions
y_pred_boosting = boosting_pipeline.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_boosting)
precision = precision_score(y_test, y_pred_boosting, average='weighted')
recall = recall_score(y_test, y_pred_boosting, average='weighted')
f1 = f1_score(y_test, y_pred_boosting, average='weighted')

print("\nBoosting (AdaBoost) Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Add results to the results dictionary (assuming you have a results dictionary)
results = {}  # If you don't have this dictionary already, uncomment this line
results['AdaBoost'] = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-score': f1
}

# If you want to use this model for predictions later
import joblib
joblib.dump(boosting_pipeline, 'adaboost_model.joblib')

# To load the model later, you can use:
# loaded_model = joblib.load('adaboost_model.joblib')


Boosting (AdaBoost) Results:
Accuracy: 0.9673
Precision: 0.9580
Recall: 0.9673
F1-score: 0.9605


['adaboost_model.joblib']

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler

def fit_models(X, y, models):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    fitted_models = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        fitted_models[name] = model
    return fitted_models, X_test, y_test

def get_best_model(models, X_test, y_test):
    best_score = 0
    best_model = None
    for name, model in models.items():
        y_pred = model.predict(X_test)
        score = f1_score(y_test, y_pred, average='weighted')
        print(f"{name} F1-score: {score}")
        if score > best_score:
            best_score = score
            best_model = model
    return best_model

def get_recommendations(movie_title, merged_df, X, y, preprocessor, models, n=5):
    try:
        if movie_title not in merged_df['primaryTitle'].values:
            return f"Movie '{movie_title}' not found in the dataset."

        X_processed = preprocessor.fit_transform(X)

        fitted_models, X_test, y_test = fit_models(X_processed, y, models)

        best_model = get_best_model(fitted_models, X_test, y_test)

        movie_index = merged_df[merged_df['primaryTitle'] == movie_title].index[0]
        movie_features = X.iloc[movie_index:movie_index+1]
        movie_features_processed = preprocessor.transform(movie_features)

        # Calculate similarities
        similarities = cosine_similarity(X_processed, movie_features_processed).flatten()
        similar_indices = similarities.argsort()[::-1][1:101]  # Get top 100 similar movies

        # Get predictions for similar movies
        if hasattr(best_model, 'predict_proba'):
            predictions = best_model.predict_proba(X_processed[similar_indices])[:, 1]
        else:
            predictions = best_model.predict(X_processed[similar_indices])

        # Create recommendations dataframe
        recommendations = merged_df.iloc[similar_indices][['primaryTitle', 'averageRating', 'genres', 'numVotes']]
        recommendations['similarity'] = similarities[similar_indices]
        recommendations['prediction'] = predictions

        # Normalize similarity and prediction
        scaler = MinMaxScaler()
        recommendations['normalized_similarity'] = scaler.fit_transform(recommendations[['similarity']])
        recommendations['normalized_prediction'] = scaler.fit_transform(recommendations[['prediction']])

        # Calculate popularity score
        recommendations['popularity_score'] = recommendations['averageRating'] * np.log1p(recommendations['numVotes'])
        recommendations['normalized_popularity'] = scaler.fit_transform(recommendations[['popularity_score']])

        # Calculate final recommendation score
        alpha = 0.4  # Weight for prediction
        beta = 0.4   # Weight for similarity
        gamma = 0.2  # Weight for popularity
        recommendations['recommendation_score'] = (
            alpha * recommendations['normalized_prediction'] +
            beta * recommendations['normalized_similarity'] +
            gamma * recommendations['normalized_popularity']
        )

        # Sort and select top N recommendations
        top_recommendations = recommendations.sort_values('recommendation_score', ascending=False).head(n)

        return top_recommendations[['primaryTitle', 'averageRating', 'genres', 'similarity', 'prediction', 'recommendation_score']]

    except Exception as e:
        return f"An error occurred: {str(e)}"

# Example usage
print("\nMovie Recommendations:")
result = get_recommendations("Pauvre Pierrot", merged_df, X, y, preprocessor, models)
print(result)


Movie Recommendations:
Random Forest F1-score: 0.95125181855688
Gradient Boosting F1-score: 0.9446739676918705
SVM F1-score: 0.95125181855688
KNN F1-score: 0.95125181855688
'Naive Bayes': GaussianNB(),XGBoost F1-score: 0.95125181855688
              primaryTitle  averageRating             genres  \
11  The Arrival of a Train            7.4  Documentary,Short   
13     The Waterer Watered            7.1       Comedy,Short   
9      Leaving the Factory            6.8  Documentary,Short   
4         Blacksmith Scene            6.2       Comedy,Short   
14     Autour d'une cabine            6.1    Animation,Short   

                                               genres  similarity  prediction  \
11                [{'id': 99, 'name': 'Documentary'}]    0.741423        0.78   
13                     [{'id': 35, 'name': 'Comedy'}]    0.867667        0.55   
9                 [{'id': 99, 'name': 'Documentary'}]    0.841184        0.18   
4   [{'id': 18, 'name': 'Drama'}, {'id': 99, 'name... 