In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



# Load the CSV dataset from the working directory; the following cells
# read it through the `df` name.
file_path = './preped.csv'
df = pd.read_csv(filepath_or_buffer=file_path)



In [None]:
# Display the column labels of the loaded frame (kept as a bare last
# expression so the notebook renders it as the cell output).
df.columns

In [None]:
# Interpret 'Release Date' as seconds since the Unix epoch (unit='s'), then
# store the calendar year and month as their own columns on `df`.
release_ts = pd.to_datetime(df['Release Date'], unit='s')
df['Release Year'] = release_ts.dt.year
df['Release Month'] = release_ts.dt.month

# Bare expression: show the new month column as the cell output.
df['Release Month']

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Predict 'Minimum Age' from every int/float column except the target itself.
target = df['Minimum Age']
features = df.select_dtypes(include=[int, float]).drop(columns=['Minimum Age'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows and report the fraction classified correctly.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [None]:

# Bar chart of the per-feature importances of the fitted 'Minimum Age' tree.
importances = clf.feature_importances_
ax = sns.barplot(x=importances, y=features.columns)

# Title and axis labels let the figure stand on its own when the notebook is
# skimmed; plt.show() renders it without the Axes repr as cell output.
ax.set(
    title="Feature importance: 'Minimum Age' decision tree",
    xlabel='Importance',
    ylabel='Feature',
)
plt.show()

In [None]:
# Retrain the 'Minimum Age' tree using only the features whose importance in
# the baseline tree (`clf`) exceeds a small threshold.

# Importance values at or below this cut-off are treated as negligible.
threshold = 0.01

# Read the importances straight from the fitted baseline tree rather than
# from the shared `importances` variable: other cells rebind that name to
# the importances of different models, so re-running this cell after them
# would build a mask of the wrong length (or a silently wrong one).
baseline_importances = clf.feature_importances_
important_features = features.columns[baseline_importances > threshold]

# Restrict the feature frame to the columns that passed the threshold.
features_important = features[important_features]

# Same seed and test size as the baseline split. `y_train` / `y_test` are
# rebound here, exactly as in the original cell.
X_train_important, X_test_important, y_train, y_test = train_test_split(
    features_important, target, test_size=0.2, random_state=42
)

# Create the decision tree classifier
clf_important = DecisionTreeClassifier(random_state=42)

# Train the classifier
clf_important.fit(X_train_important, y_train)

# Make predictions
y_pred_important = clf_important.predict(X_test_important)

# Calculate the accuracy
accuracy_important = accuracy_score(y_test, y_pred_important)
print(f'Accuracy with important features: {accuracy_important:.2f}')

In [None]:

# Bar chart of the per-feature importances of the tree trained on the reduced
# feature set. Note: this rebinds the shared `importances` name.
importances = clf_important.feature_importances_
ax = sns.barplot(x=importances, y=features_important.columns)

# Title and axis labels let the figure stand on its own when the notebook is
# skimmed; plt.show() renders it without the Axes repr as cell output.
ax.set(
    title="Feature importance: 'Minimum Age' tree (important features only)",
    xlabel='Importance',
    ylabel='Feature',
)
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Define the features and target variable for 'Is Series': every int/float
# column except the target itself.
features_is_series = df.select_dtypes(include=[int, float]).drop(columns=['Is Series'])
target_is_series = df['Is Series']

# Split the data into training and testing sets (20% held out, fixed seed)
X_train_is_series, X_test_is_series, y_train_is_series, y_test_is_series = train_test_split(
    features_is_series, target_is_series, test_size=0.2, random_state=42
)

# Create the decision tree classifier
clf_is_series = DecisionTreeClassifier(random_state=42)

# Train the classifier
clf_is_series.fit(X_train_is_series, y_train_is_series)

# Visualize the decision tree
plt.figure(figsize=(20,10))
plot_tree(
    clf_is_series,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # strictly as a list and reject a pandas Index.
    feature_names=list(features_is_series.columns),
    # NOTE(review): names are matched to the classes in ascending order, so
    # this assumes 'Is Series' holds exactly 0 and 1 -- confirm.
    class_names=['Not Series', 'Series'],
    filled=True,
)
plt.show()

In [None]:

# Bar chart of the per-feature importances of the fitted 'Is Series' tree.
# Note: this rebinds the shared `importances` name.
importances = clf_is_series.feature_importances_
ax = sns.barplot(x=importances, y=features_is_series.columns)

# Title and axis labels let the figure stand on its own when the notebook is
# skimmed; plt.show() renders it without the Axes repr as cell output.
ax.set(
    title="Feature importance: 'Is Series' decision tree",
    xlabel='Importance',
    ylabel='Feature',
)
plt.show()



In [None]:

# Classify the held-out rows with the 'Is Series' tree.
y_pred_is_series = clf_is_series.predict(X_test_is_series)

# Fraction of held-out rows whose predicted label matches the true one.
accuracy_is_series = accuracy_score(
    y_true=y_test_is_series,
    y_pred=y_pred_is_series,
)
print(f'Accuracy: {accuracy_is_series:.2f}')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Random-forest demo on a five-row simulated frame: classify whether a title's
# 'Hidden Gem Score' is above 8 and list the feature importances.
#
# NOTE: this cell rebinds `df`, `features`, `importances`, `X_train`,
# `X_test`, `y_train`, `y_test` and `y_pred`, replacing the values earlier
# cells derived from the CSV data. Re-run the notebook from the top before
# going back to those cells.

# Simulated DataFrame with placeholder data
data = {
    'Title': ['Movie1', 'Movie2', 'Movie3', 'Movie4', 'Movie5'],
    'Genre': ['Action', 'Comedy', 'Drama', 'Horror', 'Sci-Fi'],
    'Is Series': [0, 0, 0, 1, 1],
    'Hidden Gem Score': [7.8, 6.5, 8.0, 7.2, 9.1],
    'Runtime': [120, 90, 150, 110, 95],
    'IMDb Score': [8.1, 7.3, 8.5, 7.8, 9.0],
    'Rotten Tomatoes Score': [88, 72, 91, 85, 94],
    'Metacritic Score': [75, 60, 80, 70, 85],
    'Awards Received': [3, 1, 5, 2, 8],
    'Awards Nominated For': [5, 3, 8, 4, 12],
    'Boxoffice': [100000000, 50000000, 150000000, 70000000, 200000000],
    'Release Date': ['2020-01-01', '2019-05-10', '2018-07-15', '2021-10-30', '2022-08-20'],
    'IMDb Votes': [10000, 5000, 20000, 8000, 30000],
    'Minimum Age': [13, 13, 16, 18, 16],
    'Action': [1, 0, 0, 0, 1],
    'Adventure': [1, 0, 0, 0, 1],
    'Animation': [0, 0, 0, 0, 0],
    'Biography': [0, 0, 1, 0, 0],
    'Comedy': [0, 1, 0, 0, 0],
    'Crime': [0, 0, 0, 0, 0],
    'Documentary': [0, 0, 0, 0, 0],
    'Drama': [0, 0, 1, 0, 0],
    'Family': [0, 0, 0, 0, 0],
    'Fantasy': [0, 0, 0, 0, 1],
    'History': [0, 0, 0, 0, 0],
    'Horror': [0, 0, 0, 1, 0],
    'Music': [0, 0, 0, 0, 0],
    'Musical': [0, 0, 0, 0, 0],
    'Mystery': [0, 0, 0, 0, 0],
    'News': [0, 0, 0, 0, 0],
    'Romance': [0, 0, 0, 0, 0],
    'Sci-Fi': [0, 0, 0, 0, 1],
    'Sport': [0, 0, 0, 0, 0],
    'Thriller': [0, 0, 0, 1, 0],
    'War': [0, 0, 0, 0, 0],
    'Western': [0, 0, 0, 0, 0],
}

df = pd.DataFrame(data)

# Target variable: Binary classification (1 when Hidden Gem Score > 8, else 0)
df['Hidden_Gem_Target'] = (df['Hidden Gem Score'] > 8).astype(int)

# Preprocessing: replace the 'Genre' strings with integer codes
label_encoder = LabelEncoder()
df['Genre'] = label_encoder.fit_transform(df['Genre'])

# Feature-target split: drop the identifier, the raw date string, the score
# the target is derived from, and the target itself
X = df.drop(columns=['Title', 'Release Date', 'Hidden Gem Score', 'Hidden_Gem_Target'])
y = df['Hidden_Gem_Target']

# Train-test split (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 reports 0.0 for an undefined precision/recall -- the same
# value the default produces -- without emitting UndefinedMetricWarning, which
# is likely on a split this small when a class is never predicted.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Feature Importance, sorted from most to least important
importances = rf_model.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances}).sort_values(by='Importance', ascending=False)

print("\nFeature Importance:\n", importance_df)
