In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Video_Games_Sales_as_at_22_Dec_2016.csv")

In [3]:
df = df.dropna()

In [4]:
df.shape

(6825, 16)

In [5]:
df["Global_Sales"].min

<bound method Series.min of 0        82.53
2        35.52
3        32.77
6        29.80
7        28.92
         ...  
16667     0.01
16677     0.01
16696     0.01
16700     0.01
16706     0.01
Name: Global_Sales, Length: 6825, dtype: float64>

In [13]:
# New bins: breaking <1M into 100k intervals
bins = [0, 0.3, 0.6, 1, 5, 10, 20, 40]
labels = [
    '0-300K', '300K-600K', '600K-1M',
    '1-5M', '5-10M', '10-20M', '20M+'
]


# Creating the category column
df['Sales_Category'] = pd.cut(df['Global_Sales'], bins=bins, labels=labels, right=False)

# Check value counts
print(df['Sales_Category'].value_counts())


Sales_Category
0-300K       3438
300K-600K    1296
1-5M         1170
600K-1M       778
5-10M         103
10-20M         27
20M+           12
Name: count, dtype: int64


In [14]:
df['Sales_Category'] = pd.cut(df['Global_Sales'], bins=bins, labels=labels, right=False)
print(df['Sales_Category'].value_counts())


Sales_Category
0-300K       3438
300K-600K    1296
1-5M         1170
600K-1M       778
5-10M         103
10-20M         27
20M+           12
Name: count, dtype: int64


In [15]:
df = df[df['Sales_Category'].notna()]

In [16]:
df.columns

Index(['Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Critic_Score',
       'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating',
       'Sales_Category'],
      dtype='object')

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Drop unwanted columns
cols_to_drop = [
    'Global_Sales',   # original target
    'NA_Sales',       # regional leak
    'EU_Sales',       # regional leak
    'JP_Sales',       # regional leak
    'Other_Sales',    # regional leak
]
df_prep = df.drop(columns=cols_to_drop)

# 2. One-Hot Encode categorical columns
categorical_cols = ['Platform', 'Genre', 'Rating', 'Publisher', 'Developer']
df_encoded = pd.get_dummies(df_prep, columns=categorical_cols, drop_first=True)

# 3. Define X and y for classification
X = df_encoded.drop(columns=['Name','Sales_Category'], axis=1)
y = df_encoded['Sales_Category']

# 4. Train-test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Check shapes
print("X_train:", X_train.shape)
print("X_test: ", X_test.shape)
print("y_train:", y_train.shape)
print("y_test: ", y_test.shape)


X_train: (5459, 1588)
X_test:  (1365, 1588)
y_train: (5459,)
y_test:  (1365,)


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Initialize the model
log_reg = LogisticRegression(max_iter=1000)  # max_iter increased to ensure convergence

# Train the model
log_reg.fit(X_train, y_train)

# Predict on test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

      0-300K       0.57      0.95      0.72       664
        1-5M       0.42      0.44      0.43       236
      10-20M       0.00      0.00      0.00         9
        20M+       0.00      0.00      0.00         1
   300K-600K       0.00      0.00      0.00       273
       5-10M       0.30      0.14      0.19        21
     600K-1M       0.00      0.00      0.00       161

    accuracy                           0.54      1365
   macro avg       0.18      0.22      0.19      1365
weighted avg       0.36      0.54      0.43      1365

Confusion Matrix:
 [[632  32   0   0   0   0   0]
 [126 105   1   0   0   4   0]
 [  2   4   0   0   0   3   0]
 [  1   0   0   0   0   0   0]
 [222  51   0   0   0   0   0]
 [  5  12   1   0   0   3   0]
 [113  48   0   0   0   0   0]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(f"\n=== {name} ===")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    acc = accuracy_score(y_test, y_pred)
    print(" Accuracy:", acc)



=== Decision Tree ===
Classification Report:
              precision    recall  f1-score   support

      0-300K       0.73      0.74      0.73       664
        1-5M       0.53      0.54      0.54       236
      10-20M       0.50      0.33      0.40         9
        20M+       0.00      0.00      0.00         1
   300K-600K       0.29      0.29      0.29       273
       5-10M       0.37      0.33      0.35        21
     600K-1M       0.27      0.25      0.26       161

    accuracy                           0.55      1365
   macro avg       0.38      0.36      0.37      1365
weighted avg       0.54      0.55      0.55      1365

Confusion Matrix:
[[491  33   0   0 110   0  30]
 [ 31 128   2   1  34   8  32]
 [  0   3   3   0   1   1   1]
 [  0   0   0   0   0   0   1]
 [110  40   0   0  80   1  42]
 [  1   7   1   1   2   7   2]
 [ 42  31   0   0  46   2  40]]
 Accuracy: 0.5487179487179488

=== Random Forest ===
Classification Report:
              precision    recall  f1-score  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 1. Create the model (you can try different k values like 3, 5, 7)
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')

# 2. Train it
knn.fit(X_train, y_train)

# 3. Predict
y_pred_knn = knn.predict(X_test)

# 4. Evaluate
print("=== K-Nearest Neighbors ===")
print("Classification Report:")
print(classification_report(y_test, y_pred_knn))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))


from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy:", acc)


=== K-Nearest Neighbors ===
Classification Report:
              precision    recall  f1-score   support

      0-300K       0.60      0.75      0.67       664
        1-5M       0.37      0.39      0.38       236
      10-20M       0.00      0.00      0.00         9
        20M+       0.00      0.00      0.00         1
   300K-600K       0.26      0.17      0.21       273
       5-10M       0.20      0.10      0.13        21
     600K-1M       0.15      0.09      0.11       161

    accuracy                           0.48      1365
   macro avg       0.23      0.21      0.21      1365
weighted avg       0.43      0.48      0.45      1365

Confusion Matrix:
[[500  66   1   0  68   2  27]
 [ 79  91   0   0  34   5  27]
 [  0   5   0   0   1   1   2]
 [  0   0   0   0   1   0   0]
 [164  35   1   0  47   0  26]
 [  7   7   0   0   1   2   4]
 [ 77  40   0   0  29   0  15]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KNN Accuracy: 0.49743589743589745
