## Loading Data 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam


train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test_nolabel.csv')
val_df = pd.read_csv('data/val.csv')

In [2]:
print(train_df.head())
print(train_df.info())

     id  label                                               text
0  8901      5  Bennett 's naturalistic performance speaks vol...
1  2506      5  Shot in rich , shadowy black-and-white , Devil...
2  2381      5  More than their unique residences , Home Movie...
3  1262      3  The movie should be credited with remembering ...
4  2542      4  Audiences are advised to sit near the back and...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8298 entries, 0 to 8297
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      8298 non-null   int64 
 1   label   8298 non-null   int64 
 2   text    8298 non-null   object
dtypes: int64(2), object(1)
memory usage: 194.6+ KB
None


### Check class Imbalance

In [3]:
print(train_df["label"].value_counts(normalize=True))
print(val_df["label"].value_counts(normalize=True))


label
2    0.263919
4    0.261388
3    0.192818
5    0.156905
1    0.124970
Name: proportion, dtype: float64
label
4    0.275028
2    0.257593
3    0.173791
5    0.151856
1    0.141732
Name: proportion, dtype: float64


## Preprocess

In [4]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer


def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = " ".join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])  # Remove stopwords
    return text




### Apply text cleaning

In [5]:
train_df["clean_text"] = train_df["text"].apply(preprocess_text)
val_df["clean_text"] = val_df["text"].apply(preprocess_text)
test_df["clean_text"] = test_df["text"].apply(preprocess_text)

## Convert Text to Numerical Features (TF-IDF

In [6]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')

#### Transform the text data

In [7]:
X_train = vectorizer.fit_transform(train_df["clean_text"]).toarray()
X_val = vectorizer.transform(val_df["clean_text"]).toarray()
X_test = vectorizer.transform(test_df["clean_text"]).toarray()

In [8]:
y_train = train_df["label"].values
y_val = val_df["label"].values

### Class weighting

In [9]:
from keras.src.layers import BatchNormalization

model = Sequential([
    Dense(1024, activation="relu", input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(512, activation="relu"),
    BatchNormalization(),
    Dropout(0.3),
    Dense(256, activation="relu"),
    Dropout(0.2),
    Dense(1, activation="sigmoid")
])

model.compile(optimizer=Adam(learning_rate=0.0005), loss="binary_crossentropy", metrics=["accuracy"])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## Train the Model

In [10]:
import tensorflow as tf
with tf.device('/GPU:0'):
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50,  # Increase if needed
        batch_size=32
    )


Epoch 1/50
[1m 22/260[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:27[0m 369ms/step - accuracy: 0.0884 - loss: -1.0746

KeyboardInterrupt: 

## Evaluate the Model

In [12]:
# Predict on validation set
y_val_pred = (model.predict(X_val) > 0.5).astype(int)

# Accuracy score
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Classification Report
print(classification_report(y_val, y_val_pred))


[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step
Validation Accuracy: 0.1417
              precision    recall  f1-score   support

           1       0.14      1.00      0.25       252
           2       0.00      0.00      0.00       458
           3       0.00      0.00      0.00       309
           4       0.00      0.00      0.00       489
           5       0.00      0.00      0.00       270

    accuracy                           0.14      1778
   macro avg       0.03      0.20      0.05      1778
weighted avg       0.02      0.14      0.04      1778



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Make Predictions on the Test Set

In [13]:
y_test_pred = (model.predict(X_test) > 0.5).astype(int)


[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step


In [16]:
submission = pd.DataFrame({"id": test_df["id"], "label": y_test_pred.flatten()})
submission.to_csv("submission2.csv", index=False)


In [30]:
print(tf.__version__)


2.18.0


In [1]:
tf.test.is_gpu_available()

NameError: name 'tf' is not defined