In [1]:
import numpy as np
import pandas as pd
import random
import statsmodels.api as sm
from scipy.stats import spearmanr
from numpy import ndarray

In [2]:
pip install crowd-kit



In [3]:
data = {
    "movie": [
        "The Shawshank Redemption",
        "Schindler's List",
        "Interstellar",
        "Parasite",
        "Oppenheimer",
        "Untouchable",
        "Coco",
        "Up",
        "Green Book",
        "Catch Me If You Can",
        "Monsters, Inc."
    ],
    "Imdb_rating": [
        9.3,
        9.0,
        8.7,
        8.5,
        8.6,
        8.5,
        8.4,
        8.3,
        8.2,
        8.1,
        8.1
    ]
}
df = pd.DataFrame(data)

In [5]:
Imdb_df=df

In [6]:
n = len(df)
random_pairs = []

for _ in range(50):
    random_indices = random.sample(range(n), 2)
    movie1 = df.loc[random_indices[0], 'movie']
    movie2 = df.loc[random_indices[1], 'movie']
    random_pairs.append((movie1, movie2))

# Create a new DataFrame to store the random pairs and their IMDb ratings
random_pairs_df = pd.DataFrame(random_pairs, columns=['Left Movie', 'Right Movie'])

# Add columns for IMDb ratings based on movie names
random_pairs_df['Left Movie IMDb Rating'] = random_pairs_df['Left Movie'].map(df.set_index('movie')['Imdb_rating'])
random_pairs_df['Right Movie IMDb Rating'] = random_pairs_df['Right Movie'].map(df.set_index('movie')['Imdb_rating'])


In [7]:
for index, row in random_pairs_df.iterrows():
    left_rating = row["Left Movie IMDb Rating"]
    right_rating = row["Right Movie IMDb Rating"]

    if left_rating > 0.4+ right_rating:
        random_pairs_df.at[index, "Preference"] = "left"
    elif right_rating > 0.4 + left_rating:
        random_pairs_df.at[index, "Preference"] = "right"
    else:
        random_choice = random.choice(["left", "right"])
        random_pairs_df.at[index, "Preference"] = random_choice

In [8]:
random_pairs_df.head()

Unnamed: 0,Left Movie,Right Movie,Left Movie IMDb Rating,Right Movie IMDb Rating,Preference
0,Up,Interstellar,8.3,8.7,left
1,Coco,Up,8.4,8.3,right
2,Parasite,"Monsters, Inc.",8.5,8.1,left
3,Oppenheimer,Parasite,8.6,8.5,left
4,Schindler's List,"Monsters, Inc.",9.0,8.1,left


In [9]:
from crowdkit.aggregation import BradleyTerry
from sklearn.preprocessing import LabelEncoder


In [10]:
predict_df = random_pairs_df

In [11]:
predict_df.rename(columns={'Left Movie': 'left', 'Right Movie': 'right', 'Preference': 'label'}, inplace=True)

In [12]:
columns_to_delete = ["Left Movie IMDb Rating", "Right Movie IMDb Rating"]
predict_df = predict_df.drop(columns=columns_to_delete)

In [13]:
predict_df.head()

Unnamed: 0,left,right,label
0,Up,Interstellar,left
1,Coco,Up,right
2,Parasite,"Monsters, Inc.",left
3,Oppenheimer,Parasite,left
4,Schindler's List,"Monsters, Inc.",left


In [14]:
predict_df['label'] = predict_df.apply(lambda row: row['right'] if row['label'] == 'right' else row['label'], axis=1)
predict_df['label'] = predict_df.apply(lambda row: row['left'] if row['label'] == 'left' else row['label'], axis=1)


In [16]:
df=random_pairs_df

In [64]:
agg_bt = BradleyTerry(n_iter=1000).fit_predict(predict_df)

In [39]:

# Assuming you have the computed aggregated scores as a Series
agg_scores = pd.Series({
    "Catch Me If You Can": 0.012662,
    "Coco": 0.037496,
    "Green Book": 0.067802,
    "Interstellar": 0.000001,
    "Monsters, Inc.": 0.000000,
    "Oppenheimer": 0.107784,
    "Parasite": 0.052680,
    "Schindler's List": 0.255998,
    "The Shawshank Redemption": 0.240784,
    "Untouchable": 0.128138,
    "Up": 0.096657
}, name='Aggregated Score')

# Convert the Series to a DataFrame and reset the index
df = agg_scores.reset_index()

# Rename the columns
df.columns = ['Movie', 'Aggregated Score']


In [40]:
df

Unnamed: 0,Movie,Aggregated Score
0,Catch Me If You Can,0.012662
1,Coco,0.037496
2,Green Book,0.067802
3,Interstellar,1e-06
4,"Monsters, Inc.",0.0
5,Oppenheimer,0.107784
6,Parasite,0.05268
7,Schindler's List,0.255998
8,The Shawshank Redemption,0.240784
9,Untouchable,0.128138


In [42]:
from scipy.stats import spearmanr

# IMDb ranking for the movies
imdb_ranking = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

# Obtained ranking
obtained_ranking = [2, 1, 5, 6, 7, 4,  9, 8, 10, 3, 11]

# Calculate Spearman's rank correlation coefficient
rho, _ = spearmanr(imdb_ranking, obtained_ranking)

print(f"Spearman's ρ: {rho}")


Spearman's ρ: 0.6727272727272727


In [22]:
est = pd.DataFrame(agg_bt)

In [34]:
import pandas as pd
from scipy.stats import percentileofscore
import numpy as np
from sklearn.utils import resample

agg_bt = BradleyTerry(n_iter=1000).fit_predict(predict_df)

n_bootstraps = 1000

movies = predict_df['left'].unique()

# Create an empty DataFrame to store bootstrap results
bootstrap_scores = pd.DataFrame(columns=movies, index=range(n_bootstraps))

# Bootstrap resampling and score calculation
for i in range(n_bootstraps):
    # Resample with replacement
    sampled_data = resample(predict_df, replace=True, random_state=i)

    # Fit the Bradley-Terry model to the resampled data
    bootstrap_scores.loc[i] = BradleyTerry(n_iter=1000).fit_predict(sampled_data)

# Calculate confidence intervals
confidence_intervals = []
for movie in movies:
    scores = bootstrap_scores[movie]
    lower_percentile = np.percentile(scores, 2.5)
    upper_percentile = np.percentile(scores, 97.5)
    confidence_intervals.append((movie, lower_percentile, upper_percentile))

# Display confidence intervals
for movie, lower, upper in confidence_intervals:
    print(f"Movie: {movie}, Confidence Interval: ({lower:.4f}, {upper:.4f})")


Movie: Up, Confidence Interval: (0.0916, 0.2481)
Movie: Coco, Confidence Interval: (0.0000, 0.1321)
Movie: Parasite, Confidence Interval: (0.0000, 0.0811)
Movie: Oppenheimer, Confidence Interval: (0.0310, 0.1604)
Movie: Schindler's List, Confidence Interval: (0.1417, 0.2610)
Movie: Monsters, Inc., Confidence Interval: (0.0000, 0.1256)
Movie: Untouchable, Confidence Interval: (0.0000, 0.1187)
Movie: Green Book, Confidence Interval: (0.0000, 0.0757)
Movie: Catch Me If You Can, Confidence Interval: (0.0000, 0.0000)
Movie: Interstellar, Confidence Interval: (0.0290, 0.2533)
Movie: The Shawshank Redemption, Confidence Interval: (0.1161, 0.2438)
