In [2]:
import pandas as pd

# Mount Google Drive if needed
from google.colab import drive
drive.mount('/content/drive')

# Load dataset
file_path = "/content/drive/My Drive/Final_Dataset_Practice.csv"
df = pd.read_csv(file_path)

# Display first few rows
df.head()


Mounted at /content/drive


Unnamed: 0,Age,Gender,Occupation,Location,SpendTime,WatchTime,PlayingType,GamingPlatform,ReduceAnxiety,StrategicThinking,MentalHealth,SocialPressure,EarningMoney,GamingEvent,SpentMoney,SustainableIncomeSource,GamingPopularity,RecommendCareer,ProfitableBusiness,PromoteTourism
0,18-25,Male,Student,Rajshahi,1-2 hours,Daily,Action,Mobile,Slightly,Yes,No,Yes,Yes,Yes,No,Yes,Yes,Maybe,Maybe,Yes
1,18-25,Male,Student,Dhaka,Less than 1 hour,Weekly,Action,Mobile,Slightly,No,Neutral,No,No,No,Yes,Yes,Yes,No,No,Yes
2,18-25,Male,Student,Sylhet,Less than 1 hour,Rarely,Sports,Mobile,Slightly,Not Sure,Neutral,No,No,No,No,Maybe,Yes,No,Maybe,No
3,Under 18,Male,Student,Rajshahi,Less than 1 hour,Rarely,Other,Mobile,Not at all,No,No,Yes,No,No,No,Maybe,Yes,No,Maybe,Not Sure
4,18-25,Male,Student,Dhaka,Less than 1 hour,Rarely,Action,Mobile,Not at all,Yes,Yes,No,No,No,No,Maybe,Yes,No,Yes,Yes


Explore and Clean the Dataset

In [None]:
import pandas as pd
import numpy as np

# Check for missing values
print(df.isnull().sum())

# Fill missing values
df.fillna(df.select_dtypes(include=[np.number]).median(), inplace=True)  # Numeric columns
df.fillna(df.select_dtypes(include=[object]).mode().iloc[0], inplace=True)  # Categorical columns

# Check data types
print(df.dtypes)

# Convert categorical variables to numeric
df = pd.get_dummies(df, drop_first=True)

# Display cleaned dataset
print(df.head())


Age                        0
Gender                     0
Occupation                 0
Location                   0
SpendTime                  0
WatchTime                  0
PlayingType                0
GamingPlatform             0
ReduceAnxiety              0
StrategicThinking          0
MentalHealth               0
SocialPressure             0
EarningMoney               0
GamingEvent                0
SpentMoney                 0
SustainableIncomeSource    2
GamingPopularity           0
RecommendCareer            2
ProfitableBusiness         1
PromoteTourism             1
dtype: int64
Age                        object
Gender                     object
Occupation                 object
Location                   object
SpendTime                  object
WatchTime                  object
PlayingType                object
GamingPlatform             object
ReduceAnxiety              object
StrategicThinking          object
MentalHealth               object
SocialPressure             object

Feature Selection & Data Preprocessing

In [None]:
print(df.columns)


Index(['Age_26-35', 'Age_Under 18', 'Gender_Male', 'Occupation_Employed',
       'Occupation_Freelancer', 'Occupation_Student', 'Occupation_Unemployed',
       'Location_Dhaka', 'Location_Khulna', 'Location_Mymensingh',
       'Location_Rajshahi', 'Location_Rangpur', 'Location_Sylhet',
       'SpendTime_2-4 hours', 'SpendTime_Less than 1 hour',
       'SpendTime_More than 4 hours', 'WatchTime_Monthly', 'WatchTime_Rarely',
       'WatchTime_Weekly', 'PlayingType_Other', 'PlayingType_Puzzle',
       'PlayingType_RPG', 'PlayingType_Sports', 'PlayingType_Strategy',
       'GamingPlatform_Mobile', 'GamingPlatform_Other', 'GamingPlatform_PC',
       'ReduceAnxiety_Not at all', 'ReduceAnxiety_Significantly',
       'ReduceAnxiety_Slightly', 'StrategicThinking_Not Sure',
       'StrategicThinking_Yes', 'MentalHealth_No', 'MentalHealth_Yes',
       'SocialPressure_No', 'SocialPressure_Yes', 'EarningMoney_Yes',
       'GamingEvent_Yes', 'SpentMoney_Yes', 'SustainableIncomeSource_No',
       'Sus

Data preprocessing completed successfully!


Random Forest model for forecasting:

Model Optimization

Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}


Save and Deploy the Model

Visualize Trends

Sentiment Analysis

Predicting Player Behavior and Mental Health

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import accuracy_score, mean_absolute_error

# Reinitialize dataset
np.random.seed(42)

occupations = ['Student', 'Freelancer', 'Employed', 'Businessperson', 'Unemployed']
genders = ['Male', 'Female']
genres = ['FPS', 'RPG', 'Strategy', 'MOBA', 'Casual', 'Sports']
platforms = ['Mobile', 'PC', 'Console']

num_samples = 500

gaming_df = pd.DataFrame({
    "ID": np.arange(1, num_samples + 1),
    "Age": np.random.randint(14, 45, num_samples),
    "Occupation": np.random.choice(occupations, num_samples),
    "Gender": np.random.choice(genders, num_samples),
    "Gaming Hours": np.random.randint(1, 10, num_samples),
    "Favorite Genre": np.random.choice(genres, num_samples),
    "Spent Money (BDT)": np.random.randint(0, 20000, num_samples),
    "Earned Money (BDT)": np.random.randint(0, 50000, num_samples),
    "Stress Reduction": np.random.choice(["Yes", "No"], num_samples, p=[0.7, 0.3]),
    "Career Interest (Yes/No)": np.random.choice(["Yes", "No"], num_samples, p=[0.4, 0.6]),
    "Social Pressure Level": np.random.choice(["Low", "Medium", "High"], num_samples),
    "Preferred Platform": np.random.choice(platforms, num_samples),
})

# 1. Predicting Gaming Market Trends (ARIMA)
gaming_trend = gaming_df.groupby('Age')['Spent Money (BDT)'].sum().reset_index()
gaming_trend.columns = ['ds', 'y']

# ARIMA Model
arima_model = ARIMA(gaming_trend['y'], order=(2,1,2))
arima_fit = arima_model.fit()
arima_pred = arima_fit.forecast(steps=5)

# 2. Player Behavior Analysis (K-Means Clustering)
X = gaming_df[['Gaming Hours', 'Spent Money (BDT)', 'Earned Money (BDT)']]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
kmeans = KMeans(n_clusters=3, random_state=42)
gaming_df['Cluster'] = kmeans.fit_predict(X_scaled)

# 3. Spending Pattern Prediction (Random Forest)
X_train, X_test, y_train, y_test = train_test_split(X, gaming_df['Spent Money (BDT)'], test_size=0.2, random_state=42)
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_pred)

# 5. Gaming Addiction Detection (Classification using SVM & Neural Networks)
y_addiction = (gaming_df['Gaming Hours'] > 5).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y_addiction, test_size=0.2, random_state=42)

svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
svm_acc = accuracy_score(y_test, svm_pred)

nn_model = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=500)
nn_model.fit(X_train, y_train)
nn_pred = nn_model.predict(X_test)
nn_acc = accuracy_score(y_test, nn_pred)

# 7. Esports Performance Prediction (Linear Regression)
X_train, X_test, y_train, y_test = train_test_split(X, gaming_df['Earned Money (BDT)'], test_size=0.2, random_state=42)
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_mae = mean_absolute_error(y_test, lr_pred)

# Prepare Comparison Table
comparison_results_final = pd.DataFrame({
    "Model": ["ARIMA", "K-Means", "Random Forest", "SVM", "Neural Network", "Linear Regression"],
    "Metric": ["Forecasting", "Clustering", "Prediction (MAE)", "Classification (Acc)", "Classification (Acc)", "Regression (MAE)"],
    "Result": [arima_pred.mean(), "3 Clusters", rf_mae, svm_acc, nn_acc, lr_mae]
})

# Display final results correctly without external dependencies

import pandas as pd

# Displaying the final results as a DataFrame output for the user
comparison_results_final



  warn('Non-stationary starting autoregressive parameters'


Unnamed: 0,Model,Metric,Result
0,ARIMA,Forecasting,159433.796879
1,K-Means,Clustering,3 Clusters
2,Random Forest,Prediction (MAE),24.925
3,SVM,Classification (Acc),0.52
4,Neural Network,Classification (Acc),0.59
5,Linear Regression,Regression (MAE),0.0


In [None]:
from xgboost import XGBClassifier

# Use XGBoost Classifier
model = XGBClassifier(random_state=42)

# Hyperparameter Tuning for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model from GridSearchCV
best_model = grid_search.best_estimator_

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the model
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

AttributeError: 'super' object has no attribute '__sklearn_tags__'