In [46]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the prepared dataset into a DataFrame.
# The relative path is resolved against the kernel's current working directory.
file_path = '../preped.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Display the column labels of the loaded frame as this cell's output.
df.columns

In [None]:
# Derive calendar features from 'Release Date'.
# unit='s' makes pandas interpret the values as seconds since the Unix epoch.
# The conversion is done once and reused for both derived columns instead of
# re-parsing the whole column for each of them.
release_dates = pd.to_datetime(df['Release Date'], unit='s')
df['Release Year'] = release_dates.dt.year
df['Release Month'] = release_dates.dt.month

# Show the new month column as the cell's output for a quick sanity check.
df['Release Month']

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Target: 'Minimum Age'. Predictors: every int/float column of df except the target.
target = df['Minimum Age']
features = df.select_dtypes(include=[int, float]).drop('Minimum Age', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Build a decision tree with default hyperparameters and fit it on the training
# rows (fit() returns the estimator itself, so construction and training chain).
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and report the fraction classified correctly.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Importance scores of the full-feature 'Minimum Age' tree, plus the column
# order that sorts them from most to least important.
importances = clf.feature_importances_
indices = np.flip(np.argsort(importances))

n_features = features.shape[1]
bar_positions = range(n_features)
bar_labels = features.columns[indices].tolist()

# Bar chart: one bar per feature, tallest first, labels rotated to stay legible.
plt.figure(figsize=(12, 8))
plt.title("Feature Importances")
plt.bar(bar_positions, importances[indices], align="center")
plt.xticks(bar_positions, bar_labels, rotation=90)
plt.xlim(-1, n_features)
plt.show()

In [None]:
# Retrain the 'Minimum Age' tree using only the features whose importance in the
# full-feature model (clf) exceeds a small threshold.

# Define the threshold for low importance
threshold = 0.01

# Read the importances straight from the full-feature model. The shared
# `importances` variable is rebound by the later plotting cells to arrays of a
# different length, so relying on it makes this cell fail (boolean mask length
# mismatch) or select the wrong columns when the notebook is re-run out of order.
full_model_importances = clf.feature_importances_

# Filter out features with importance below the threshold
important_features = features.columns[full_model_importances > threshold]

# Update the features dataframe to include only important features
features_important = features[important_features]

# Split the data into training and testing sets with important features.
# Same test_size and random_state as the full-feature split; y_train / y_test are
# rebound here from the same `target`.
X_train_important, X_test_important, y_train, y_test = train_test_split(
    features_important, target, test_size=0.2, random_state=42
)

# Create the decision tree classifier
clf_important = DecisionTreeClassifier(random_state=42)

# Train the classifier
clf_important.fit(X_train_important, y_train)

# Make predictions
y_pred_important = clf_important.predict(X_test_important)

# Calculate the accuracy
accuracy_important = accuracy_score(y_test, y_pred_important)
print(f'Accuracy with important features: {accuracy_important:.2f}')

In [None]:
import matplotlib.pyplot as plt

# Importance scores of the reduced-feature model (clf_important), sorted from
# most to least important.
# NOTE: this rebinds the notebook-level `importances` / `indices`, replacing the
# values computed for the full-feature model.
importances = clf_important.feature_importances_
indices = np.argsort(importances)[::-1]

# Plot the feature importances.
# clf_important was trained to predict 'Minimum Age', so the title names that
# target (it previously said 'Is Series', which belongs to a different model).
plt.figure(figsize=(12, 8))
plt.title("Feature Importances for 'Minimum Age' (important features only)")
plt.bar(range(features_important.shape[1]), importances[indices], align="center")
plt.xticks(range(features_important.shape[1]), [features_important.columns[i] for i in indices], rotation=90)
plt.xlim([-1, features_important.shape[1]])
plt.show()


In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Define the features and target variable for 'Is Series':
# every int/float column of df except the target itself.
features_is_series = df.select_dtypes(include=[int, float]).drop(columns=['Is Series'])
target_is_series = df['Is Series']

# Split the data into training and testing sets (20% held out, fixed seed)
X_train_is_series, X_test_is_series, y_train_is_series, y_test_is_series = train_test_split(
    features_is_series, target_is_series, test_size=0.2, random_state=42
)

# Create the decision tree classifier
clf_is_series = DecisionTreeClassifier(random_state=42)

# Train the classifier
clf_is_series.fit(X_train_is_series, y_train_is_series)

# Visualize the decision tree.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter strictly and reject a pandas Index.
# class_names are matched to clf_is_series.classes_ in ascending order.
# NOTE(review): assumes 'Is Series' has exactly two classes with the
# "not a series" value sorting first (e.g. 0/1 or False/True) - confirm.
plt.figure(figsize=(20,10))
plot_tree(
    clf_is_series,
    feature_names=list(features_is_series.columns),
    class_names=['Not Series', 'Series'],
    filled=True,
)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Importance scores of the 'Is Series' tree and the ordering that ranks them
# from highest to lowest.
importances = clf_is_series.feature_importances_
indices = np.flip(np.argsort(importances))

n_features = features_is_series.shape[1]
tick_positions = range(n_features)
tick_labels = features_is_series.columns[indices].tolist()

# Draw the ranked importances as a bar chart with vertical feature labels.
plt.figure(figsize=(12, 8))
plt.title("Feature Importances for 'Is Series'")
plt.bar(tick_positions, importances[indices], align="center")
plt.xticks(tick_positions, tick_labels, rotation=90)
plt.xlim(-1, n_features)
plt.show()


In [None]:
# Evaluate the 'Is Series' tree on its held-out rows: predict, then report the
# share of predictions that match the true labels.
y_pred_is_series = clf_is_series.predict(X_test_is_series)
accuracy_is_series = accuracy_score(
    y_true=y_test_is_series,
    y_pred=y_pred_is_series,
)
print(f'Accuracy: {accuracy_is_series:.2f}')