In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/Job/data/copy_sentiment140.csv', encoding='latin1', header=None)

# Assign column names
data.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Sample a fraction of the data, e.g., 33%
fraction = 0.05
data = data.sample(frac=fraction, random_state=42)

# Display the first few rows of the dataset
data.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,target,ids,date,flag,user,text
541200,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs AHHH I HOPE YOUR OK!!!
750,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo..."
766711,0,2300048954,Tue Jun 23 13:40:11 PDT 2009,NO_QUERY,sammydearr,@TiannaChaos i know just family drama. its la...
285055,0,1993474027,Mon Jun 01 10:26:07 PDT 2009,NO_QUERY,Lamb_Leanne,School email won't open and I have geography ...
705995,0,2256550904,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,yogicerdito,upper airways problem


# Preprocessing Text Data

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Initialize stopwords
stop_words = set(stopwords.words('english'))

# Preprocessing function to clean the text
def preprocess_text(text):
    # Remove URLs, mentions, and hashtags
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    # Remove punctuation and lowercase the text
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Remove stopwords
    text = ' '.join([word for word in nltk.word_tokenize(text) if word not in stop_words])
    return text

# Apply the preprocessing to the text column
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Convert target values (0, 4) to binary (0, 1)
data['target'] = data['target'].apply(lambda x: 0 if x == 0 else 1)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Split Data into Training and Test Sets

In [None]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['target'], test_size=0.2, random_state=42)


# Pre-trained BERT Model and Tokenizer

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import numpy as np

# Load pre-trained DistilBERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased').to('cuda')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

# Dataset and DataLoader for Batch Processing

In [None]:
from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=self.max_length)
        return inputs['input_ids'].squeeze(), inputs['attention_mask'].squeeze()

# Create dataset and dataloader
batch_size = 32  # Adjust based on GPU memory
dataset = TextDataset(X_train, tokenizer, max_length=512)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=2)

# Ensure the model is on the GPU
bert_model = bert_model.to('cuda')




# BERT Embeddings

In [None]:
embeddings = []

# Get BERT embeddings
for input_ids, attention_mask in dataloader:
    input_ids, attention_mask = input_ids.to('cuda'), attention_mask.to('cuda')
    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)
    cls_embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
    embeddings.extend(cls_embeddings)

X_train_bert = np.array(embeddings)

# Display the shape of the BERT embeddings
X_train_bert.shape

(64000, 768)

## Dataset and DataLoader for the Test Set

In [None]:
# Assuming you have `X_test` data, which is the cleaned text of the test set

# Create dataset for test data
test_dataset = TextDataset(X_test, tokenizer, max_length=512)

# Create dataloader for test data
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=2)


# Extract BERT Embeddings for the Test Set

In [None]:
test_embeddings = []

for input_ids, attention_mask in test_dataloader:
    input_ids, attention_mask = input_ids.to('cuda'), attention_mask.to('cuda')
    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)
    cls_embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
    test_embeddings.extend(cls_embeddings)

X_test_bert = np.array(test_embeddings)


## Scaling BERT

In [None]:
from sklearn.preprocessing import StandardScaler

# Scaling the data
scaler = StandardScaler()
X_train_bert_scaled = scaler.fit_transform(X_train_bert)
X_test_bert_scaled = scaler.transform(X_test_bert)

# Training and Evaluating Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Training the Logistic Regression model
model = LogisticRegression(max_iter=1000, C=1.0)
model.fit(X_train_bert_scaled, y_train)

# Making predictions and evaluating the model
y_pred = model.predict(X_test_bert_scaled)

# Evaluate the model
print(f"Final Model Accuracy: {accuracy_score(y_test, y_pred)}")
print("Final Model Classification Report:\n", classification_report(y_test, y_pred))
print("Final Model Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Final Model Accuracy: 0.479125
Final Model Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.48      0.48      8005
           1       0.48      0.47      0.48      7995

    accuracy                           0.48     16000
   macro avg       0.48      0.48      0.48     16000
weighted avg       0.48      0.48      0.48     16000

Final Model Confusion Matrix:
 [[3877 4128]
 [4206 3789]]


In [None]:
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation
cv_scores = cross_val_score(model, X_train_bert_scaled, y_train, cv=5)
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean()}")


Cross-Validation Scores: [0.50484375 0.50398438 0.50109375 0.50578125 0.50679688]
Mean CV Accuracy: 0.5045
