In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score



In [2]:
# Load dataset with explicit encoding to handle special characters
df = pd.read_csv("anxiety.csv", encoding='latin1')

In [3]:
df.isnull().sum()

S. No.                 0
Timestamp              0
GAD1                   0
GAD2                   0
GAD3                   0
GAD4                   0
GAD5                   0
GAD6                   0
GAD7                   0
GADE                 649
SWL1                   0
SWL2                   0
SWL3                   0
SWL4                   0
SWL5                   0
Game                   0
Platform               0
Hours                 30
earnings               0
whyplay                0
League              1852
highestleague      13464
streams              100
SPIN1                124
SPIN2                154
SPIN3                140
SPIN4                159
SPIN5                166
SPIN6                156
SPIN7                138
SPIN8                144
SPIN9                158
SPIN10               160
SPIN11               187
SPIN12               168
SPIN13               187
SPIN14               156
SPIN15               147
SPIN16               147
SPIN17               175


In [4]:
# === Step 1: Drop columns with too many missing values (optional threshold) ===
threshold = 0.6  # Drop columns with >60% missing
missing_ratio = df.isnull().mean()
cols_to_drop = missing_ratio[missing_ratio > threshold].index.tolist()
df.drop(columns=cols_to_drop, inplace=True)
print(f"üßπ Dropped columns with excessive missing values: {cols_to_drop}")

# === Step 2: Fill remaining missing values ===
# For categorical columns ‚Üí fill with 'Unknown'
cat_cols = df.select_dtypes(include='object').columns
df[cat_cols] = df[cat_cols].fillna('Unknown')

# For numeric columns ‚Üí fill with median
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

print("‚úÖ Missing values handled successfully.")

üßπ Dropped columns with excessive missing values: ['highestleague']
‚úÖ Missing values handled successfully.


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13464 entries, 0 to 13463
Data columns (total 54 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   S. No.           13464 non-null  int64  
 1   Timestamp        13464 non-null  float64
 2   GAD1             13464 non-null  int64  
 3   GAD2             13464 non-null  int64  
 4   GAD3             13464 non-null  int64  
 5   GAD4             13464 non-null  int64  
 6   GAD5             13464 non-null  int64  
 7   GAD6             13464 non-null  int64  
 8   GAD7             13464 non-null  int64  
 9   GADE             13464 non-null  object 
 10  SWL1             13464 non-null  int64  
 11  SWL2             13464 non-null  int64  
 12  SWL3             13464 non-null  int64  
 13  SWL4             13464 non-null  int64  
 14  SWL5             13464 non-null  int64  
 15  Game             13464 non-null  object 
 16  Platform         13464 non-null  object 
 17  Hours       

In [6]:
df.isnull().sum()

S. No.             0
Timestamp          0
GAD1               0
GAD2               0
GAD3               0
GAD4               0
GAD5               0
GAD6               0
GAD7               0
GADE               0
SWL1               0
SWL2               0
SWL3               0
SWL4               0
SWL5               0
Game               0
Platform           0
Hours              0
earnings           0
whyplay            0
League             0
streams            0
SPIN1              0
SPIN2              0
SPIN3              0
SPIN4              0
SPIN5              0
SPIN6              0
SPIN7              0
SPIN8              0
SPIN9              0
SPIN10             0
SPIN11             0
SPIN12             0
SPIN13             0
SPIN14             0
SPIN15             0
SPIN16             0
SPIN17             0
Narcissism         0
Gender             0
Age                0
Work               0
Degree             0
Birthplace         0
Residence          0
Reference          0
Playstyle    

In [7]:
# Drop irrelevant columns
drop_cols = ['S. No.', 'Timestamp', 'GADE', 'Residence_ISO3', 'Birthplace_ISO3']
df.drop(columns=drop_cols, inplace=True, errors='ignore')

In [8]:
df.head(5)

Unnamed: 0,GAD1,GAD2,GAD3,GAD4,GAD5,GAD6,GAD7,SWL1,SWL2,SWL3,...,Work,Degree,Birthplace,Residence,Reference,Playstyle,accept,GAD_T,SWL_T,SPIN_T
0,0,0,0,0,1,0,0,3,5,5,...,Unemployed / between jobs,Bachelor¬†(or equivalent),USA,USA,Reddit,Singleplayer,Accept,1,23,5.0
1,1,2,2,2,0,1,0,3,5,2,...,Unemployed / between jobs,Bachelor¬†(or equivalent),USA,USA,Reddit,Multiplayer - online - with strangers,Accept,8,16,33.0
2,0,2,2,0,0,3,1,2,6,5,...,Employed,Bachelor¬†(or equivalent),Germany,Germany,Reddit,Singleplayer,Accept,8,17,31.0
3,0,0,0,0,0,0,0,2,5,5,...,Employed,Bachelor¬†(or equivalent),USA,USA,Reddit,Multiplayer - online - with online acquaintanc...,Accept,0,17,11.0
4,2,1,2,2,2,3,2,2,2,4,...,Employed,High school diploma (or equivalent),USA,South Korea,Reddit,Multiplayer - online - with strangers,Accept,14,14,13.0


In [9]:
df.tail()

Unnamed: 0,GAD1,GAD2,GAD3,GAD4,GAD5,GAD6,GAD7,SWL1,SWL2,SWL3,...,Work,Degree,Birthplace,Residence,Reference,Playstyle,accept,GAD_T,SWL_T,SPIN_T
13459,1,0,0,1,0,1,1,6,6,5,...,Student at college / university,Master¬†(or equivalent),France,France,Reddit,Multiplayer - online - with strangers,Accept,4,28,7.0
13460,3,3,3,3,2,3,3,5,5,5,...,Student at college / university,High school diploma (or equivalent),USA,USA,Reddit,Multiplayer - online - with strangers,Accept,20,23,25.0
13461,0,0,0,0,0,0,0,6,6,7,...,Student at college / university,High school diploma (or equivalent),Norway,Norway,Reddit,Multiplayer - online - with real life friends,Accept,0,32,10.0
13462,3,2,1,3,0,1,3,2,6,3,...,Student at school,High school diploma (or equivalent),Canada,Canada,Reddit,Singleplayer,Accept,13,16,32.0
13463,1,1,0,0,0,0,0,5,6,5,...,Student at college / university,High school diploma (or equivalent),Canada,Canada,Reddit,Multiplayer - online - with strangers,Accept,2,25,14.0


In [10]:
# Define target: Anxiety if GAD_T >= 10
df['Anxiety_Label'] = df['GAD_T'].apply(lambda x: 1 if x >= 10 else 0)

In [11]:
# === Step 2: Encode Categorical Columns ===
categorical_cols = df.select_dtypes(include='object').columns.tolist()
encoders = {}
for col in categorical_cols:
    df[col] = df[col].fillna("Unknown").astype(str)
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le


In [12]:
# === Step 3: Prepare Features and Target ===
X = df.drop(['GAD_T', 'Anxiety_Label'], axis=1)
y = df['Anxiety_Label']

In [13]:
# === Step 3: Validate and Prepare Data ===

# Check if df is loaded and not empty
if 'df' not in locals() or df is None or df.empty:
    raise ValueError("‚ùå The dataframe 'df' is empty or not loaded. Please load and preprocess the data correctly.")

# Check required columns exist
required_columns = ['GAD_T', 'Anxiety_Label']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"‚ùå Missing required columns: {missing_columns}. Please check your preprocessing.")

# Define features and target
X = df.drop(required_columns, axis=1, errors='ignore')
y = df['Anxiety_Label']

# Check if X and y are valid
if X.empty:
    raise ValueError("‚ùå Feature set (X) is empty. Please check your preprocessing.")
if y.empty:
    raise ValueError("‚ùå Target variable (y) is empty. Please check your preprocessing.")

# === Step 4: Train-Test Split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# === Step 5: Train LightGBM Model ===
model = LGBMClassifier(random_state=42)
model.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 1887, number of negative: 8884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011375 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1290
[LightGBM] [Info] Number of data points in the train set: 10771, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.175193 -> initscore=-1.549264
[LightGBM] [Info] Start training from score -1.549264


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [15]:
# === Step 6: Evaluate Model ===
y_pred = model.predict(X_test)
print("\nüìä Model Evaluation:\n")
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"‚úÖ Model Accuracy: {accuracy * 100:.2f}%")



üìä Model Evaluation:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2228
           1       0.98      0.98      0.98       465

    accuracy                           0.99      2693
   macro avg       0.99      0.99      0.99      2693
weighted avg       0.99      0.99      0.99      2693

‚úÖ Model Accuracy: 99.37%


In [16]:
def ML(input_df):
    print("üì• Inside ML(), input_df preview:\n", input_df.head())
    print("üìä Input shape:", input_df.shape)
    print("üìä Expected shape:", X.shape)
    
    input_df = input_df.copy()

    # Ensure all columns are numeric
    for col in input_df.columns:
        try:
            input_df[col] = pd.to_numeric(input_df[col], errors='coerce').fillna(0)
        except Exception as e:
            print(f"‚ö†Ô∏è Could not convert column '{col}' to numeric: {e}")
            input_df[col] = 0

    # Ensure the DataFrame has the same columns as training data
    if input_df.shape[1] != X.shape[1]:
        print(f"‚ùå Column count mismatch: {input_df.shape[1]} vs {X.shape[1]}")
        return None
    
    # Ensure column order matches training data
    input_df = input_df.reindex(columns=X.columns, fill_value=0)
    
    print("üì• Final processed input:\n", input_df.head())
    print("üìä Final shape:", input_df.shape)

    try:
        prediction = model.predict(input_df)
        print("üîÆ ML prediction output:", prediction)
        return prediction[0]  # Return scalar value
    except Exception as e:
        print(f"‚ùå Prediction failed: {e}")
        return None



In [17]:
def predict_from_input():
    # Create a more comprehensive user input with all required columns
    user_input = {
        'GAD1': 2, 'GAD2': 2, 'GAD3': 2, 'GAD4': 1, 'GAD5': 2, 'GAD6': 3, 'GAD7': 2,
        'SWL1': 3, 'SWL2': 4, 'SWL3': 5, 'SWL4': 4, 'SWL5': 3,
        'Platform': 'PC', 'Gender': 'Female', 'Age': 24, 'Work': 'Employed',
        'Degree': 'Bachelor (or equivalent)', 'Playstyle': 'Singleplayer',
        'SPIN1': 1, 'SPIN2': 2, 'SPIN3': 3, 'SPIN4': 2, 'SPIN5': 1,
        'SPIN6': 2, 'SPIN7': 1, 'SPIN8': 2, 'SPIN9': 3, 'SPIN10': 2,
        'SPIN11': 1, 'SPIN12': 2, 'SPIN13': 1, 'SPIN14': 2, 'SPIN15': 1,
        'SPIN16': 2, 'SPIN17': 1, 'Narcissism': 2, 'accept': 'Accept',
        'Game': 'Other', 'Hours': 5.0, 'earnings': 'No', 'whyplay': 'Fun',
        'League': 'Bronze', 'streams': 0.0
    }

    input_df = pd.DataFrame([user_input])
    print("üì• Initial input DataFrame:\n", input_df.head())

    # Add missing columns with default values
    for col in X.columns:
        if col not in input_df.columns:
            if col in encoders:
                # For categorical columns, use the first known class
                input_df[col] = encoders[col].classes_[0]
            else:
                # For numeric columns, use 0
                input_df[col] = 0

    # Reorder columns to match training data
    input_df = input_df.reindex(columns=X.columns, fill_value=0)
    print("üì• After adding missing columns:\n", input_df.head())

    # Encode categorical columns
    for col in input_df.columns:
        if col in encoders:
            try:
                # Convert to string first
                input_df[col] = input_df[col].astype(str)
                
                # Handle unseen labels by adding them to the encoder
                known_classes = set(encoders[col].classes_)
                input_classes = set(input_df[col])
                unseen = input_classes - known_classes

                if unseen:
                    # Add unseen classes to the encoder
                    encoders[col].classes_ = list(encoders[col].classes_) + list(unseen)

                # Transform the column
                input_df[col] = encoders[col].transform(input_df[col])
                print(f"‚úÖ Encoded column '{col}'")
            except Exception as e:
                print(f"‚ö†Ô∏è Encoding failed for column '{col}': {e}")
                # Use a default value if encoding fails
                input_df[col] = 0
        else:
            try:
                input_df[col] = pd.to_numeric(input_df[col], errors='coerce')
                # Fill any NaN values with 0
                input_df[col] = input_df[col].fillna(0)
                print(f"‚úÖ Converted column '{col}' to numeric")
            except Exception as e:
                print(f"‚ö†Ô∏è Could not convert column '{col}' to numeric: {e}")
                input_df[col] = 0

    # Ensure all columns are numeric
    for col in input_df.columns:
        input_df[col] = pd.to_numeric(input_df[col], errors='coerce').fillna(0)
        
    print("üì• Final input to model:\n", input_df.head())
    print("üìä Input DataFrame shape:", input_df.shape)
    print("üìä Expected shape:", X.shape)
    
    # Check if shapes match
    if input_df.shape[1] != X.shape[1]:
        print(f"‚ùå Shape mismatch: input has {input_df.shape[1]} columns, expected {X.shape[1]}")
        return None
    
    prediction = ML(input_df)

    print("\nüîÆ Prediction value:", prediction)
    print("üì§ Received prediction:", prediction)

    if prediction is None:
        print("‚ùå Prediction failed due to input formatting issues.")
    elif prediction == 1:
        print("\nüßæ Prediction Result:\n‚Üí Anxiety Detected üòü")
    elif prediction == 0:
        print("\nüßæ Prediction Result:\n‚Üí No Anxiety Detected üòä")
    else:
        print(f"\nüßæ Unexpected prediction output: {prediction}")

predict_from_input()


üì• Initial input DataFrame:
    GAD1  GAD2  GAD3  GAD4  GAD5  GAD6  GAD7  SWL1  SWL2  SWL3  ...  SPIN16  \
0     2     2     2     1     2     3     2     3     4     5  ...       2   

   SPIN17 Narcissism  accept   Game Hours earnings whyplay  League  streams  
0       1          2  Accept  Other   5.0       No     Fun  Bronze      0.0  

[1 rows x 43 columns]
üì• After adding missing columns:
    GAD1  GAD2  GAD3  GAD4  GAD5  GAD6  GAD7  SWL1  SWL2  SWL3  ...  Age  \
0     2     2     2     1     2     3     2     3     4     5  ...   24   

       Work                    Degree   Birthplace  Residence    Reference  \
0  Employed  Bachelor (or equivalent)  Afghanistan    Albania  CrowdFlower   

      Playstyle  accept  SWL_T  SPIN_T  
0  Singleplayer  Accept      0       0  

[1 rows x 48 columns]
‚úÖ Converted column 'GAD1' to numeric
‚úÖ Converted column 'GAD2' to numeric
‚úÖ Converted column 'GAD3' to numeric
‚úÖ Converted column 'GAD4' to numeric
‚úÖ Converted column 'GAD5' 