In [None]:
!pip install kagglehub


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import kagglehub

# Fetch the NLTK resources the cleaning step relies on (stop-word list and WordNet).
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)





# kagglehub returns the local folder the dataset was downloaded to.
dataset_dir = kagglehub.dataset_download("kazanova/sentiment140")

# `path` is left pointing at the CSV file itself, as later code expects.
path = f"{dataset_dir}/training.1600000.processed.noemoticon.csv"
columns = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(path, header=None, names=columns, encoding='latin-1')

# Relabel target value 4 as 1; every other value is left as it is.
df['target'] = df['target'].replace({4: 1})

# Built once at definition time: the cleaner is applied to every row of the
# dataset, so rebuilding these objects inside the function repeats the same
# setup work on each call.
_URL_RE = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_MENTION_HASHTAG_RE = re.compile(r'@\w+|\#\w+')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise one raw tweet into a space-separated string of lemmas.

    Steps, in order: lower-case; strip URLs, @mentions and #hashtags; strip
    every character that is neither a word character nor whitespace; strip
    digit runs; drop English stop words; lemmatize the remaining tokens.

    Args:
        text: The raw text. Any non-string value (for example a missing cell)
            is treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing survives).
    """
    if not isinstance(text, str):
        # Missing or non-text cells would otherwise fail on .lower().
        return ''
    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _MENTION_HASHTAG_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)
    return ' '.join(
        _LEMMATIZER.lemmatize(word)
        for word in text.split()
        if word not in _STOP_WORDS
    )





In [None]:
# Clean every tweet element-wise and store the result in a new column.
df['cleaned_text'] = df['text'].map(preprocess_text)

In [None]:
# Features are the cleaned tweets, labels the binarised target; 20% is held out.
X, y = df['cleaned_text'], df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)





In [None]:
# TF-IDF features, capped at 3000 terms via max_features.
tfidf = TfidfVectorizer(max_features=3000)
# Fit on the training split only, then reuse the fitted vectorizer on the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)



In [None]:
# Oversample the training features with SMOTE (3 nearest neighbours, fixed seed).
# Only the training split is resampled; the test features are left untouched.
# NOTE(review): confirm the training labels are actually imbalanced — if the two
# classes are already close to 50/50 this step adds cost without changing much.
smote = SMOTE(random_state=22, k_neighbors=3)  
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)


In [None]:
# Baseline 1: logistic regression on the resampled TF-IDF features.
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_resampled, y_train_resampled)

# Baseline 2: random forest. n_jobs=-1 builds the trees on all available cores
# (the search cell already does the same); with a fixed random_state the fitted
# forest does not depend on the number of workers.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1)
rf_model.fit(X_train_resampled, y_train_resampled)

def evaluate_model(model, X_test, y_test):
    """Print accuracy, precision, recall, F1 and ROC-AUC for a fitted classifier.

    Args:
        model: Fitted estimator exposing `predict` and `predict_proba`.
        X_test: Feature matrix to score.
        y_test: True labels for `X_test`.

    Returns:
        None. The five metrics are printed with four decimals each.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC-AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # All metrics are computed before anything is printed.
    report = (
        ("Accuracy", accuracy_score(y_test, predicted_labels)),
        ("Precision", precision_score(y_test, predicted_labels)),
        ("Recall", recall_score(y_test, predicted_labels)),
        ("F1-Score", f1_score(y_test, predicted_labels)),
        ("AUC-ROC", roc_auc_score(y_test, positive_scores)),
    )
    for metric_name, metric_value in report:
        print(f"{metric_name}: {metric_value:.4f}")

# Report the same metric set for both baselines on the held-out TF-IDF features.
for heading, fitted_model in (
    ("Logistic Regression Evaluation:", lr_model),
    ("\nRandom Forest Evaluation:", rf_model),
):
    print(heading)
    evaluate_model(fitted_model, X_test_tfidf, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Search space for logistic regression (exhaustive grid).
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 500]
}

# Search space for the random forest (sampled, see n_iter below).
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# NOTE(review): the 3 CV folds are cut from the already-resampled training
# matrix, so synthetic rows can sit in a validation fold next to the rows they
# were generated from — treat the CV scores as optimistic; confirm if that matters.
lr_grid = GridSearchCV(LogisticRegression(random_state=42), lr_param_grid, cv=3, scoring='accuracy', n_jobs=-1)
lr_grid.fit(X_train_resampled, y_train_resampled)
print("Best Logistic Regression Parameters:", lr_grid.best_params_)

# random_state pins which 10 candidates are drawn from rf_param_grid, so a
# re-run of this cell evaluates the same configurations.
rf_random = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
rf_random.fit(X_train_resampled, y_train_resampled)
print("Best Random Forest Parameters:", rf_random.best_params_)

In [None]:
!pip install torch 

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import kagglehub
from collections import Counter

# NLTK resources needed by the cleaning function defined below.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Download folder first, then the CSV inside it.
path = kagglehub.dataset_download("kazanova/sentiment140")
csv_path = f"{path}/training.1600000.processed.noemoticon.csv"

columns = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(csv_path, header=None, names=columns, encoding='latin-1')
# Target value 4 becomes 1; every other value is left unchanged.
df['target'] = df['target'].replace({4: 1})

# Shared by every call of preprocess_text, so they are created once here.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Clean one tweet and return its lemmatized, stop-word-free tokens as a string.

    The text is lower-cased, then URLs, @mentions/#hashtags, non-word
    characters and digit runs are removed in that order before tokenising on
    whitespace.
    """
    cleaned = text.lower()
    removal_steps = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),
        (r'@\w+|\#\w+', 0),
        (r'[^\w\s]', 0),
        (r'\d+', 0),
    )
    for pattern, flags in removal_steps:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in cleaned.split()
        if token not in stop_words
    )

# Clean every tweet element-wise.
df['cleaned_text'] = df['text'].map(preprocess_text)

# 80/20 split of cleaned text against the binarised target.
X, y = df['cleaned_text'], df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Vocabulary cap (including the two reserved tokens) and fixed sequence length.
vocab_size, max_len = 5000, 100

class Vocabulary:
    """Word <-> integer-id mapping with reserved padding and unknown tokens.

    Ids 0 and 1 are reserved for "<PAD>" and "<UNK>". Real words receive ids
    from 2 upwards, most frequent first.
    """

    PAD_TOKEN = "<PAD>"
    UNK_TOKEN = "<UNK>"

    def __init__(self):
        self.word2idx = {self.PAD_TOKEN: 0, self.UNK_TOKEN: 1}
        self.idx2word = {0: self.PAD_TOKEN, 1: self.UNK_TOKEN}
        self.idx = 2  # next id to hand out

    def build_vocab(self, sentences, max_size=None):
        """Add the most frequent words of `sentences` to the vocabulary.

        Args:
            sentences: Iterable of whitespace-tokenisable strings.
            max_size: Upper bound on the total number of entries, reserved
                tokens included. Defaults to the notebook-level `vocab_size`.

        Words that already have an id keep it, so calling this method again
        never re-numbers an entry or grows the vocabulary past `max_size`.
        """
        if max_size is None:
            max_size = vocab_size  # notebook-level constant
        counter = Counter()
        for sentence in sentences:
            counter.update(sentence.split())
        for word, _ in counter.most_common():
            if len(self.word2idx) >= max_size:
                break
            if word in self.word2idx:
                continue
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def sentence_to_ids(self, sentence):
        """Return the id of every whitespace token; unseen words map to <UNK>."""
        unk_id = self.word2idx[self.UNK_TOKEN]
        return [self.word2idx.get(word, unk_id) for word in sentence.split()]

# Build the vocabulary from the training split only; words that do not make it
# into the vocabulary are later mapped to the <UNK> id by sentence_to_ids.
vocab = Vocabulary()
vocab.build_vocab(X_train)

def text_to_sequences(texts, vocab, max_len):
    """Convert texts to a fixed-width matrix of token ids.

    Each text is mapped through `vocab.sentence_to_ids`, truncated to
    `max_len` ids and right-padded with 0 (the padding id).

    Args:
        texts: Iterable of strings.
        vocab: Object exposing `sentence_to_ids(str) -> list[int]`.
        max_len: Number of columns of the result.

    Returns:
        An int64 array of shape (len(texts), max_len). Empty input yields a
        (0, max_len) array rather than a 1-D float array.
    """
    texts = list(texts)
    # Pre-allocating the padded matrix avoids building one Python list per row.
    sequences = np.zeros((len(texts), max_len), dtype=np.int64)
    for row, text in enumerate(texts):
        ids = vocab.sentence_to_ids(text)[:max_len]
        if ids:
            sequences[row, :len(ids)] = ids
    return sequences

# Padded id matrices for both splits.
X_train_pad, X_test_pad = (
    text_to_sequences(split, vocab, max_len) for split in (X_train, X_test)
)

# Token ids as long tensors, labels as float tensors.
X_train_tensor, y_train_tensor = torch.LongTensor(X_train_pad), torch.FloatTensor(y_train.values)
X_test_tensor, y_test_tensor = torch.LongTensor(X_test_pad), torch.FloatTensor(y_test.values)

batch_size = 64

# Only the training loader shuffles.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

class CNN(nn.Module):
    """1-D convolutional classifier over token-id sequences.

    Embedding -> Conv1d(k=5) -> ReLU -> MaxPool(2) -> Conv1d(k=5) -> ReLU ->
    max over time -> Linear -> ReLU -> Dropout -> Linear -> sigmoid.

    Args:
        num_embeddings: Size of the embedding table. Defaults to the
            notebook-level `vocab_size`.
    """

    def __init__(self, num_embeddings=None):
        super().__init__()
        if num_embeddings is None:
            num_embeddings = vocab_size  # notebook-level constant
        self.embedding = nn.Embedding(num_embeddings, 128)
        self.conv1 = nn.Conv1d(128, 128, 5)
        self.pool = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(128, 64, 5)
        self.fc1 = nn.Linear(64, 64)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        """Map a (batch, seq_len) tensor of ids to a (batch,) tensor of probabilities.

        `seq_len` must be long enough to survive both kernel-size-5
        convolutions and the pooling step in between.
        """
        x = self.embedding(x)
        # Conv1d expects channels before the time axis.
        x = x.permute(0, 2, 1)
        x = self.pool(nn.functional.relu(self.conv1(x)))
        x = nn.functional.relu(self.conv2(x))
        # Global max over the time axis.
        x = x.max(dim=2)[0]
        x = nn.functional.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.sigmoid(self.fc2(x))
        # Drop only the trailing feature axis: a bare squeeze() would also
        # remove the batch axis for a batch of one, and the resulting 0-dim
        # output no longer matches the (1,) label tensor in BCELoss.
        return x.squeeze(-1)

class LSTM(nn.Module):
    """Single-layer LSTM classifier over right-padded token-id sequences.

    Args:
        num_embeddings: Size of the embedding table. Defaults to the
            notebook-level `vocab_size`.
        pad_idx: Id used for padding (0, matching text_to_sequences).
    """

    def __init__(self, num_embeddings=None, pad_idx=0):
        super().__init__()
        if num_embeddings is None:
            num_embeddings = vocab_size  # notebook-level constant
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(num_embeddings, 128)
        self.lstm = nn.LSTM(128, 128, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(128, 1)

    def forward(self, x):
        """Map a (batch, seq_len) tensor of ids to a (batch,) tensor of probabilities."""
        # Sequences are padded on the right, so the output at the final time
        # step is produced after running over the padding. Read the output at
        # the last real token instead. Rows that are entirely padding fall
        # back to time step 0.
        lengths = (x != self.pad_idx).sum(dim=1).clamp(min=1)
        embedded = self.embedding(x)
        outputs, _ = self.lstm(embedded)
        rows = torch.arange(outputs.size(0), device=outputs.device)
        last_real = outputs[rows, lengths - 1]
        probs = torch.sigmoid(self.fc(self.dropout(last_real)))
        # squeeze(-1) keeps the batch axis even for a batch of one.
        return probs.squeeze(-1)

def train_model(model, train_loader, test_loader, epochs=5):
    """Train a binary classifier with BCE loss and Adam, reporting per-epoch stats.

    Args:
        model: Module returning one probability per input row.
        train_loader: Iterable of (inputs, float labels) batches to train on.
        test_loader: Iterable of (inputs, float labels) batches scored after
            every epoch; its accuracy is what is printed as "Val Accuracy".
        epochs: Number of passes over `train_loader`.

    Returns:
        None. The model is updated in place and left in eval mode.
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            # Flatten to (batch,): a model that squeezes its output returns a
            # 0-dim tensor for a final batch of one sample, which BCELoss
            # rejects because it no longer matches the (1,) label tensor.
            outputs = model(inputs).reshape(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                outputs = model(inputs).reshape(-1)
                predicted = (outputs > 0.5).float()
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

        print(f'Epoch {epoch+1}/{epochs}')
        print(f'Train Loss: {train_loss/len(train_loader):.4f}')
        print(f'Val Accuracy: {correct/total:.4f}\n')

# Train the convolutional model with train_model's default number of epochs.
# The models are constructed in this order (CNN, then LSTM); keep it, since
# each constructor draws its initial weights from the global torch RNG.
print("Training CNN:")
cnn_model = CNN()
train_model(cnn_model, train_loader, test_loader)

# Same procedure and data loaders for the recurrent model.
print("\nTraining LSTM:")
lstm_model = LSTM()
train_model(lstm_model, train_loader, test_loader)

def evaluate_model(model, test_loader):
    """Return the fraction of samples in `test_loader` the model classifies correctly.

    The model is switched to eval mode, and an output above 0.5 counts as a
    prediction of class 1.
    """
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for batch_inputs, batch_labels in test_loader:
            probabilities = model(batch_inputs)
            guesses = torch.gt(probabilities, 0.5).float()
            hits += torch.eq(guesses, batch_labels).sum().item()
            seen += batch_labels.size(0)
    return hits / seen

# Final held-out accuracy of both networks, printed with four decimals.
for model_label, trained_net in (("CNN", cnn_model), ("LSTM", lstm_model)):
    print(f"{model_label} Test Accuracy: {evaluate_model(trained_net, test_loader):.4f}")

In [None]:
!pip install torch  transformers datasets evaluate accelerate


In [None]:
import os
import numpy as np
import torch
import evaluate
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoTokenizer,
    AdamW,
    get_scheduler
)

os.environ["TOKENIZERS_PARALLELISM"] = "false"


def _add_binary_label(batch):
    """Derive a 0/1 `label` column from the raw `sentiment` score (4 -> 1, else 0)."""
    return {"label": [1 if score == 4 else 0 for score in batch["sentiment"]]}


def _with_binary_labels(split):
    """Return `split` restricted to negative/positive rows, with a `label` column.

    The model is built with num_labels=2 and the Trainer reads its targets
    from a `label` column, so the raw polarity has to be remapped first.
    Per the Sentiment140 dataset card the polarity is stored in a `sentiment`
    column coded 0 / 2 / 4 (negative / neutral / positive) — TODO confirm
    against `dataset["train"].features`. A split without that column is
    returned unchanged.
    """
    if "sentiment" not in split.column_names:
        return split
    # Neutral rows (2) have no counterpart in a two-class model.
    split = split.filter(lambda example: example["sentiment"] in (0, 4))
    return split.map(_add_binary_label, batched=True)


dataset = load_dataset("sentiment140")
# Fixed-seed 5000-row sample of the training split keeps the fine-tuning run short.
train_data = _with_binary_labels(dataset["train"].shuffle(seed=42).select(range(5000)))
test_data = _with_binary_labels(dataset["test"])

# Pretrained BERT with a two-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)

# Freeze every parameter under model.bert; only parameters outside that
# sub-module stay trainable.
for param in model.bert.parameters():
    param.requires_grad = False  

# Tokenizer from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the `text` field of a batch, padding/truncating to 128 tokens."""
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# F1 metric object shared by every evaluation call.
metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Return the macro-averaged F1 for a (logits, labels) evaluation pair."""
    raw_scores, references = eval_pred
    # The highest-scoring class along the last axis is the prediction.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(
        references=references,
        predictions=predicted_classes,
        average="macro",
    )

# Tokenize both splits in batches; tokenize_function reads the "text" column.
tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_test = test_data.map(tokenize_function, batched=True)

# Training configuration: evaluate and checkpoint once per epoch, keeping at
# most two checkpoints on disk.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    report_to="none",
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing when the arguments are validated.
    fp16=torch.cuda.is_available(),
    # Two batches are accumulated per optimizer step.
    gradient_accumulation_steps=2,
    push_to_hub=False
)

# Same learning rate as in training_args. Frozen parameters receive no
# gradient and are therefore never updated by the optimizer.
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)

# The scheduler advances once per optimizer update, not once per example, so
# its horizon has to be counted in updates: batches per epoch, divided by the
# accumulation factor, times the number of epochs. Counting examples instead
# makes the linear decay cover only a tiny part of its range during training.
# This approximates the Trainer's own count for a single-process run.
batches_per_epoch = -(-len(tokenized_train) // training_args.train_batch_size)  # ceil division
updates_per_epoch = max(-(-batches_per_epoch // training_args.gradient_accumulation_steps), 1)
num_training_steps = int(updates_per_epoch * training_args.num_train_epochs)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Wire model, data, metric and the hand-built optimizer/scheduler pair together.
# Passing `optimizers` means the Trainer uses these objects rather than
# creating its own.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler)  
)

# Run fine-tuning; as the last expression, the return value is shown as the cell output.
trainer.train()




In [None]:
# Score the model on the Trainer's eval_dataset and show the resulting metrics.
results = trainer.evaluate()
print(results)



In [None]:
text = "I love this movie! It’s fantastic."

# Inference mode: disables dropout so the prediction is deterministic.
model.eval()
# The tokenizer produces CPU tensors; move them to wherever the model lives
# (after training on a GPU the model is no longer on the CPU).
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
with torch.no_grad():  # no gradients are needed for a single prediction
    outputs = model(**inputs)
predictions = outputs.logits.softmax(dim=1).argmax().item()

# Class index 1 is reported as positive, anything else as negative.
print("Positive" if predictions == 1 else "Negative")

In [None]:
# Save model and tokenizer side by side in the same directory.
model.save_pretrained("./sentiment140_finetuned")
tokenizer.save_pretrained("./sentiment140_finetuned")