# Decision Tree with MovieLens Dataset

MovieLens dataset ကို အသုံးပြုပြီး Decision Tree Classifier နဲ့ ရုပ်ရှင် rating ခန့်မှန်းခြင်း

**Dataset**: https://www.kaggle.com/datasets/ayushimishra2809/movielens-dataset

**Goal**: User ရဲ့ rating behavior ကို predict လုပ်မယ် (High rating: `rating >= 4`, Low rating: `rating < 4`)

In [None]:
# Libraries
# Numerical arrays and tabular data handling.
import numpy as np
import pandas as pd
# Plotting.
import matplotlib.pyplot as plt
# scikit-learn: data splitting, the decision-tree model and its plot helper,
# and the evaluation metrics used further down.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
# Suppress every warning category from this point on to keep cell output clean.
# NOTE(review): this also hides deprecation warnings from pandas/scikit-learn;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Data Load
# The CSV files are read from the Kaggle input directory below. When running
# elsewhere, download the dataset from Kaggle and point DATA_PATH at the folder
# holding the files (keep the trailing slash: the file name is appended as-is).
DATA_PATH = '/kaggle/input/movielens-dataset/'

# Load ratings and movies
ratings = pd.read_csv(filepath_or_buffer=f'{DATA_PATH}ratings.csv')
movies = pd.read_csv(filepath_or_buffer=f'{DATA_PATH}movies.csv')

# Sanity check of what was loaded: table sizes first, then column names.
print("Ratings shape:", ratings.shape)
print("Movies shape:", movies.shape)
print("\nRatings columns:", list(ratings.columns))
print("Movies columns:", list(movies.columns))

In [None]:
# Data Preview
# Show the first five rows of each table (display is the notebook's rich
# renderer, so this cell is meant to run inside IPython/Jupyter).
print("=== Ratings Data ===")
display(ratings.head(n=5))

print("\n=== Movies Data ===")
display(movies.head(n=5))

In [None]:
# Merge ratings with movies
# Inner join on movieId: every rating row gains the columns of its movie;
# ratings whose movieId is absent from movies are dropped.
df = ratings.merge(right=movies, how='inner', on='movieId')
print("Merged data shape:", df.shape)
display(df.head(n=5))

In [None]:
# Feature Engineering
# One-hot encode the genres: the '|'-separated genre string of each row is
# split into one 0/1 indicator column per genre. The result shares df's index.
genres = df['genres'].str.get_dummies(sep='|')
print("Genre columns:", list(genres.columns))

# NOTE(review): the statistics below are computed over every rating in df,
# including the very rating each row is later asked to predict, and before any
# train/test split. They therefore leak target information into the features.

# User statistics: mean, number and standard deviation of each user's ratings.
user_stats = (
    df.groupby('userId')['rating']
    .agg(
        user_avg_rating='mean',
        user_rating_count='count',
        user_rating_std='std',
    )
    .reset_index()
)
# The sample standard deviation is NaN for a user with a single rating;
# treat that as "no spread".
user_stats['user_rating_std'] = user_stats['user_rating_std'].fillna(0)

# Movie statistics: mean and number of ratings each movie received.
movie_stats = (
    df.groupby('movieId')['rating']
    .agg(movie_avg_rating='mean', movie_rating_count='count')
    .reset_index()
)

print("\nUser stats shape:", user_stats.shape)
print("Movie stats shape:", movie_stats.shape)

In [None]:
# Prepare the final dataset
# Attach the one-hot genre columns to df BEFORE merging in the statistics.
# genres was derived from df and shares its index, so this concat is
# guaranteed to be row-aligned. Concatenating after the merges (as was done
# previously) relied on the merges keeping df's row order, which inner merges
# did not reliably do on pandas versions before 2.2 -- genre indicators could
# end up attached to the wrong rating rows.
df_final = (
    pd.concat([df, genres], axis=1)
    .merge(user_stats, on='userId')
    .merge(movie_stats, on='movieId')
)

# Binary Classification: High Rating (>= 4) vs Low Rating (< 4)
df_final['high_rating'] = (df_final['rating'] >= 4).astype(int)

print("Final dataset shape:", df_final.shape)
print("\nTarget distribution:")
print(df_final['high_rating'].value_counts())

In [None]:
# Separate features and target
# Model inputs: the five user/movie statistics followed by every one-hot
# genre indicator column.
feature_cols = [
    'user_avg_rating',
    'user_rating_count',
    'user_rating_std',
    'movie_avg_rating',
    'movie_rating_count',
    *genres.columns,
]

X = df_final.loc[:, feature_cols]
y = df_final['high_rating']  # 1 = high rating, 0 = low rating

print("Features shape:", X.shape)
print("Target shape:", y.shape)
# Only the statistic names are printed in full; the genre columns are counted.
print("\nFeature columns:", feature_cols[:5], "... +", genres.shape[1], "genre columns")

In [None]:
# Train/Test Split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set:", X_train.shape)
print("Test set:", X_test.shape)

In [None]:
# Decision Tree Model
# A shallow tree (depth <= 5, at least 10 samples needed to split a node)
# with a fixed seed. fit() returns the estimator itself, so construction and
# training are chained.
dt_model = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=10,
    random_state=42,
).fit(X_train, y_train)
print("Model training complete!")

In [None]:
# Prediction and Evaluation
# Predict the held-out rows, then report overall accuracy and the per-class
# precision/recall/F1 table.
y_pred = dt_model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\n=== Classification Report ===")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=['Low Rating', 'High Rating'],  # class 0, class 1
    )
)

In [None]:
# Confusion Matrix
# Pin the label order so the matrix is always 2x2 (row/column 0 = Low,
# 1 = High). Without it the shape is inferred from the labels present in
# y_test/y_pred, and the fixed 2x2 annotation loop below would raise an
# IndexError if only one class showed up.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(8, 6))
plt.imshow(cm, cmap='Blues')
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.xticks([0, 1], ['Low', 'High'])
plt.yticks([0, 1], ['Low', 'High'])
# Annotate each cell with its count. Cells above the midpoint of the colour
# range are dark blue, so switch to white text there to keep counts readable.
contrast_threshold = (cm.max() + cm.min()) / 2
for i in range(2):
    for j in range(2):
        text_color = 'white' if cm[i, j] > contrast_threshold else 'black'
        plt.text(j, i, cm[i, j], ha='center', va='center', fontsize=16,
                 color=text_color)
plt.show()

In [None]:
# Decision Tree Visualization
# Draw the fitted tree on a wide canvas: nodes are colour-filled with rounded
# corners, labelled with the feature names and the Low/High class names.
plt.figure(figsize=(20, 10))
plot_tree(
    decision_tree=dt_model,
    feature_names=feature_cols,
    class_names=['Low', 'High'],
    filled=True,
    rounded=True,
    fontsize=8,
)
plt.title('Decision Tree Visualization')
plt.tight_layout()
plt.show()

In [None]:
# Feature Importance
# Pair every feature name with the importance the fitted tree assigned to it,
# ordered from most to least important.
importance = pd.DataFrame(
    {
        'feature': feature_cols,
        'importance': dt_model.feature_importances_,
    }
).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ten most important features.
plt.figure(figsize=(10, 6))
plt.barh(importance['feature'].head(10), importance['importance'].head(10))
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Top 10 Feature Importance')
# barh draws the first bar at the bottom; flip the axis so the most important
# feature is shown at the top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print("\nTop 10 Features:")
display(importance.head(10))