In [None]:
import pandas as pd

# Location of the ratings dataset, relative to the notebook's working directory.
file_path = './archive/netflix-rotten-tomatoes-metacritic-imdb.csv'

# Load the whole CSV into the DataFrame that the following cells work on.
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution overview of the three score columns: one histogram (with a KDE
# overlay) per score, plus a combined box plot in the fourth panel.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Plots 1-3: score column -> panel title, drawn in this order.
histogram_titles = {
    'IMDb Score': 'Distribution of IMDb Scores',
    'Rotten Tomatoes Score': 'Distribution of Rotten Tomatoes Scores',
    'Metacritic Score': 'Distribution of Metacritic Score',
}
for ax, (column, title) in zip(axes.flat, histogram_titles.items()):
    sns.histplot(data=df, x=column, kde=True, ax=ax)
    ax.set_title(title)

# Plot 4: box plot of all scores.
# Take an explicit copy before rescaling: assigning into a column subset of
# df raises SettingWithCopyWarning, and the copy makes it unambiguous that
# df itself is never modified by this cell.
scores_df = df[list(histogram_titles)].copy()
# Scale IMDb scores to 0-100 to match other scores
scores_df['IMDb Score'] = scores_df['IMDb Score'] * 10

box_ax = axes.flat[3]
sns.boxplot(data=scores_df, ax=box_ax)
box_ax.set_title('Box Plot of All Scores (0-100)')

fig.tight_layout()
plt.show()

In [None]:
# Convert the textual 'View Rating' column into a numeric 'Minimum Age' column.
# Lookup table: view rating label -> minimum viewer age used in this analysis.
view_rating_to_age = {
    'G': 0,
    'E10+': 10,
    'TV-Y7-FV': 7,
    'PG': 10,
    'PG-13': 13,
    'R': 17,
    'NC-17': 17,
    'MA-17': 17,
    'Approved': 0,
    'Passed': 0,
    'Unrated': 0,
    'UNRATED': 0,
    'E': 0,
    'Not Rated': 0,
    'NOTRATED': 0,
    'NOT RATED': 0,
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'AL': 0,
    'GP': 0,
    'TV-PG': 10,
    'TV-14': 14,
    'M/PG': 15,
    '15': 15,
    'U': 0,
    'M': 15,
    'TV-MA': 17,
    'X': 18,
    'TV-13': 13,
}

# Guarded so the cell can be re-run: once 'View Rating' has been dropped the
# conversion is skipped instead of raising KeyError.
if 'View Rating' in df.columns:
    # Missing ratings are treated as 'Unrated', which the table maps to age 0.
    view_ratings = df['View Rating'].fillna('Unrated')

    # .map() performs an exact per-value lookup and yields a numeric column.
    # Ratings absent from the table become NaN rather than staying behind as
    # strings, which would leave a mixed str/int column that cannot be sorted.
    minimum_age = view_ratings.map(view_rating_to_age)

    unmapped_ratings = sorted(view_ratings[minimum_age.isna()].unique())
    if unmapped_ratings:
        print(f"Warning: view ratings without a minimum age (set to NaN): {unmapped_ratings}")

    # The numeric column supersedes the textual one.
    df = df.assign(**{'Minimum Age': minimum_age}).drop(columns='View Rating')

# Check how many titles have age rating 18 or 15
age_18_count = (df['Minimum Age'] == 18).sum()
age_15_count = (df['Minimum Age'] == 15).sum()

print(f"Number of titles with age rating 18: {age_18_count}")
print(f"Number of titles with age rating 15: {age_15_count}")


In [None]:
# Bar chart: number of titles per minimum-age value, ordered by age.
age_counts = df['Minimum Age'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=age_counts.index, y=age_counts.values, ax=ax)
ax.set_title('Distribution of Minimum Age Ratings')
ax.set_xlabel('Minimum Age')
ax.set_ylabel('Number of Titles')
ax.tick_params(axis='x', labelrotation=0)
# Dashed horizontal guides only, faded so the bars stay dominant.
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Select the first column by position (not by name) and print it as plain text.
first_column = df.iloc[:, 0]
print(first_column)

In [None]:
# Summary statistics for the numerical columns of df.
# Left as the bare last expression so the notebook renders the resulting
# table as the cell's output.
df.describe()


In [None]:

# Concise structural summary of df (info() prints its report directly,
# so no print()/display wrapper is needed).
df.info()

In [None]:
import matplotlib.pyplot as plt

# Line plot of the IMDb score of each row, in DataFrame index order.
# NOTE(review): the title says "Distribution", but this is a per-row line
# plot rather than a histogram - confirm the intended wording.
score_ax = df['IMDb Score'].plot.line(title='Distribution of IMDb Scores')
score_ax.set_xlabel('Movie Index')
score_ax.set_ylabel('IMDb Score')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Line plot of the IMDb vote count of each row, in DataFrame index order.
fig, votes_ax = plt.subplots(figsize=(12, 6))
df['IMDb Votes'].plot.line(ax=votes_ax, title='IMDb Votes of Every Movie')
votes_ax.set_xlabel('Movie Index')
votes_ax.set_ylabel('IMDb Votes')
plt.show()

In [None]:
# Scatter of IMDb rating (x) against number of votes (y), one point per row.
fig, scatter_ax = plt.subplots()
scatter_ax.scatter(df['IMDb Score'], df['IMDb Votes'])
scatter_ax.set_title('Correlation between IMDb Rating and Votes')
scatter_ax.set_xlabel('IMDb Rating')
scatter_ax.set_ylabel('Number of Votes')
plt.show()

In [None]:
import seaborn as sns
# Restrict to float64/int64 columns, since only those take part in the
# correlation matrix below.
df_numerical = df.select_dtypes(include=['float64', 'int64'])

# Pairwise correlation between every pair of numerical columns.
corr = df_numerical.corr()

# Annotated heatmap of the matrix, values shown with two decimals.
fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap of Numerical Values')
plt.show()

In [None]:
# Columns shown for each ranked title.
ranking_columns = ['Title', 'IMDb Score']

# Three highest-scoring rows: sort descending by IMDb Score, keep the first three.
top_3_imdb = df.sort_values('IMDb Score', ascending=False).iloc[:3]
print("Top 3 movies by IMDb Score:")
print(top_3_imdb[ranking_columns])

# Three lowest-scoring rows: same idea with an ascending sort.
bottom_3_imdb = df.sort_values('IMDb Score', ascending=True).iloc[:3]
print("\nBottom 3 movies by IMDb Score:")
print(bottom_3_imdb[ranking_columns])

In [None]:
# Convert the current DataFrame (all columns, as modified by earlier cells)
# to a NumPy array.
numpy_array = df.to_numpy()
# Bare last expression: the notebook shows the array's repr as the cell output.
numpy_array

In [None]:
import numpy as np

import csv

# Load every data row of the raw CSV into a 2-D NumPy array of strings.
# csv.reader is used instead of np.genfromtxt because genfromtxt splits on
# every comma and treats '#' as a comment marker: quoted fields that contain
# commas or '#' would either raise a column-count error or be silently cut.
# newline='' is what the csv module requires for correct quoted-newline
# handling; utf-8 matches the default pandas.read_csv uses on this file.
with open(file_path, newline='', encoding='utf-8') as csv_file:
    csv_rows = csv.reader(csv_file)
    next(csv_rows, None)  # skip the header row
    # Blank lines come back as empty lists; leave them out.
    nparr = np.array([row for row in csv_rows if row], dtype=str)
print(nparr)

In [None]:
import numpy as np

import csv

# Load only the first column of the raw CSV into a 1-D NumPy array of strings.
# csv.reader is used instead of np.genfromtxt because genfromtxt splits on
# every comma and treats '#' as a comment marker: a first-column value that
# is quoted and contains a comma would be truncated, and a line starting
# with '#' would be dropped entirely.
# newline='' is what the csv module requires for correct quoted-newline
# handling; utf-8 matches the default pandas.read_csv uses on this file.
with open(file_path, newline='', encoding='utf-8') as csv_file:
    csv_rows = csv.reader(csv_file)
    next(csv_rows, None)  # skip the header row
    # Blank lines come back as empty lists; leave them out.
    nparr = np.array([row[0] for row in csv_rows if row], dtype=str)
print(nparr)