<a href="https://colab.research.google.com/github/Triveni1349/NPL-LAB/blob/main/Untitled51.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
uploaded = files.upload()


Saving training.1600000.processed.noemoticon.csv.zip to training.1600000.processed.noemoticon.csv.zip


In [2]:
# Option A: Keras CharCNN + Word Embedding + Soundex
# Requirements: tensorflow, pandas, scikit-learn, matplotlib, joblib
# Run on Colab for easiest TF availability.

import re, zipfile, os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report, accuracy_score
import joblib

# TensorFlow / Keras
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPool1D, Dense, Dropout, concatenate, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# ------------- config -------------
ZIP_PATH = "/content/training.1600000.processed.noemoticon.csv.zip"  # adjust
N_SAMPLES = 100000            # increase/decrease depending on GPU/RAM
MAX_CHAR_LEN = 280
MAX_WORDS = 20000
MAX_WORD_LEN = 50
SND_MAX_FEAT = 500
BATCH_SIZE = 256
EPOCHS = 4                    # increase if you have time/GPU
# -----------------------------------

# --- load CSV from zip (Sentiment140 typical columns)
with zipfile.ZipFile(ZIP_PATH, "r") as z:
    csvf = [f for f in z.namelist() if f.lower().endswith(".csv")][0]
    df = pd.read_csv(z.open(csvf), header=None, usecols=[0,5], names=["target","text"], encoding='latin-1')

df = df[df['target'].isin([0,4])].copy()
df['target'] = df['target'].map({0:0, 4:1})

if len(df) > N_SAMPLES:
    df = df.sample(N_SAMPLES, random_state=42).reset_index(drop=True)

# --- preprocessing
def clean_tweet(text):
    text = str(text)
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#(\w+)", r"\1", text)
    text = re.sub(r"[^A-Za-z0-9\s'`]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip().lower()

df['clean'] = df['text'].apply(clean_tweet)
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42, stratify=df['target'])

# --- soundex helper
def soundex(word):
    if not word:
        return ""
    word = word.lower()
    first = word[0].upper()
    mapping = {'b':'1','f':'1','p':'1','v':'1',
               'c':'2','g':'2','j':'2','k':'2','q':'2','s':'2','x':'2','z':'2',
               'd':'3','t':'3',
               'l':'4',
               'm':'5','n':'5',
               'r':'6'}
    digits = []
    prev = ''
    for ch in word[1:]:
        d = mapping.get(ch, '0')
        if d != prev:
            digits.append(d); prev = d
    digits = [d for d in digits if d != '0']
    code = first + ''.join(digits)
    return (code + '000')[:4]

def tweet_to_soundex_codes(tweet):
    words = tweet.split()
    codes = [soundex(w) for w in words if w]
    return " ".join(codes)

# --- build features for Soundex vectorizer
from sklearn.feature_extraction.text import CountVectorizer
train_snd = train_df['clean'].apply(tweet_to_soundex_codes)
test_snd = test_df['clean'].apply(tweet_to_soundex_codes)
snd_vect = CountVectorizer(max_features=SND_MAX_FEAT)
Xs_train = snd_vect.fit_transform(train_snd).toarray()
Xs_test = snd_vect.transform(test_snd).toarray()

# --- word tokenizer (like FastText substitute if you don't have pretrained embeddings)
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['clean'].tolist())
X_word_train = pad_sequences(tokenizer.texts_to_sequences(train_df['clean'].tolist()), maxlen=MAX_WORD_LEN)
X_word_test = pad_sequences(tokenizer.texts_to_sequences(test_df['clean'].tolist()), maxlen=MAX_WORD_LEN)

# --- char-to-index mapping
all_text = " ".join(train_df['clean'].tolist())
chars = sorted(list(set(all_text)))
allowed = list("abcdefghijklmnopqrstuvwxyz0123456789 .,!?;:'\"/\\|-_@#&()[]{}")
charset = [c for c in allowed if c in chars]
if not charset:
    charset = chars
char_to_idx = {c:i+1 for i,c in enumerate(charset)}
def text_to_char_seq(s, maxlen=MAX_CHAR_LEN):
    seq = [char_to_idx.get(c,0) for c in s[:maxlen]]
    return seq

X_char_train = pad_sequences(train_df['clean'].apply(text_to_char_seq).tolist(), maxlen=MAX_CHAR_LEN, padding='post')
X_char_test = pad_sequences(test_df['clean'].apply(text_to_char_seq).tolist(), maxlen=MAX_CHAR_LEN, padding='post')

# --- labels
y_train = train_df['target'].values
y_test = test_df['target'].values

# --- build model (3-branch)
char_vocab_size = len(char_to_idx) + 1
char_input = Input(shape=(MAX_CHAR_LEN,), name='char_input')
char_emb = Embedding(input_dim=char_vocab_size, output_dim=50, name='char_emb')(char_input)
char_drop = SpatialDropout1D(0.2)(char_emb)
char_conv = Conv1D(filters=128, kernel_size=7, activation='relu')(char_drop)
char_pool = GlobalMaxPool1D()(char_conv)
char_dense = Dense(64, activation='relu')(char_pool)

word_input = Input(shape=(MAX_WORD_LEN,), name='word_input')
word_emb = Embedding(input_dim=MAX_WORDS, output_dim=100, name='word_emb')(word_input)
word_drop = SpatialDropout1D(0.2)(word_emb)
word_conv = Conv1D(filters=128, kernel_size=5, activation='relu')(word_drop)
word_pool = GlobalMaxPool1D()(word_conv)
word_dense = Dense(64, activation='relu')(word_pool)

snd_input = Input(shape=(Xs_train.shape[1],), name='snd_input')
snd_dense = Dense(64, activation='relu')(snd_input)

merged = concatenate([char_dense, word_dense, snd_dense])
x = Dense(128, activation='relu')(merged)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)
out = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[char_input, word_input, snd_input], outputs=out)
model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# --- train
history = model.fit(
    x={'char_input': X_char_train, 'word_input': X_word_train, 'snd_input': Xs_train},
    y=y_train,
    validation_split=0.1,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS
)

# --- evaluate & plots
y_proba = model.predict({'char_input': X_char_test, 'word_input': X_word_test, 'snd_input': Xs_test}).ravel()
y_pred = (y_proba >= 0.5).astype(int)

cm = confusion_matrix(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

print("ROC AUC:", roc_auc)
print(classification_report(y_test, y_pred, digits=4))
# Plot the four graphs (confusion, ROC, epochs vs acc, loss vs acc) using matplotlib (one per figure)
# (standard plotting code omitted here for brevity — see Option B for example plotting)
# Save model & preprocessing:
joblib.dump({'model': model, 'tokenizer': tokenizer, 'snd_vect': snd_vect, 'char_to_idx': char_to_idx}, 'char_word_soundex_keras.pkl')
print("Saved preprocessing + model to char_word_soundex_keras.pkl")


Epoch 1/4
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 784ms/step - accuracy: 0.6834 - loss: 0.5787 - val_accuracy: 0.7893 - val_loss: 0.4493
Epoch 2/4
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 732ms/step - accuracy: 0.8249 - loss: 0.3889 - val_accuracy: 0.7880 - val_loss: 0.4486
Epoch 3/4
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m278s[0m 785ms/step - accuracy: 0.8847 - loss: 0.2789 - val_accuracy: 0.7839 - val_loss: 0.5117
Epoch 4/4
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 727ms/step - accuracy: 0.9293 - loss: 0.1789 - val_accuracy: 0.7678 - val_loss: 0.6547
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 31ms/step
ROC AUC: 0.8551764314540612
              precision    recall  f1-score   support

           0     0.7407    0.8263    0.7812      7491
           1     0.8042    0.7114    0.7549      7509

    accuracy                         0.7688     15000
   macro avg     