## Data Importing and Cleaning

#### *Installing and Downloading Packages*

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
# Fetch the NLTK resources used by the preprocessing step below:
#   - 'punkt' / 'punkt_tab': tokenizer models for nltk.word_tokenize. Recent NLTK
#     releases load 'punkt_tab', older ones load 'punkt'; requesting both keeps the
#     notebook working on either (an unknown id is reported by nltk.download, not raised).
#   - 'stopwords': the English stop-word list.
#   - 'wordnet': the lexical database behind WordNetLemmatizer.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

#### *Mounting Google Drive*

In [None]:
# Mount Google Drive into the Colab runtime so files stored on Drive are
# reachable under /content/drive (the dataset path used later lives there).
from google.colab import drive
drive.mount('/content/drive')

#### *Importing the Dataset File*

In [None]:
# Load the tweet dataset from Google Drive, trying a few encodings in turn.
csv_path = '/content/drive/MyDrive/Colab Notebooks/Sentiment_analysis/Dataset/training.1600000.processed.noemoticon.csv'
encodings = ['utf-8', 'ISO-8859-1', 'cp1252']
df = None
for encoding in encodings:
    try:
        # header=None: column names are assigned explicitly right after loading,
        # so the first line of the file must be kept as data instead of being
        # consumed as a header row.
        # NOTE(review): assumes the CSV ships without a header line (the file name
        # matches the Sentiment140 training set, which has none) - confirm.
        df = pd.read_csv(csv_path, encoding=encoding, header=None)
    except UnicodeDecodeError:
        # This encoding cannot decode the file; fall through to the next candidate.
        continue
    break

if df is None:
    # Fail here with a clear message rather than later with an AttributeError on None.
    raise ValueError(
        f"Could not decode {csv_path!r} with any of the encodings {encodings}"
    )

#### *Giving Column Names*

In [None]:
# Give the six columns of the loaded frame descriptive names, in file order.
new_column_names = [
    'score',
    'id',
    'Date',
    'Query',
    'username',
    'text',
]
df.columns = new_column_names

## --> Data Transformation

##### *Cleaning the text: lowercasing, removal of unwanted characters, tokenization, stop-word removal, lemmatization*

In [None]:
# Resources shared by every call to preprocess_text. They are built once here
# instead of once per row, because the function is applied to the whole
# DataFrame column and rebuilding them per call is pure overhead.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def preprocess_text(text):
    """Return a cleaned, lemmatized version of *text*.

    Steps, in order: lowercase, drop every character that is not an ASCII
    letter or whitespace, tokenize with NLTK, remove English stop words,
    lemmatize each remaining token, and join the tokens with single spaces.

    Args:
        text: The raw text to clean. Non-string values (for example a missing
            cell read as NaN) are treated as empty text.

    Returns:
        The cleaned text as a single space-separated string; may be empty.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN and have no .lower(); treat as empty.
        return ''

    # Lowercasing
    text = text.lower()

    # Removing special characters, digits and punctuation
    text = _NON_LETTER_RE.sub('', text)

    # Tokenization
    tokens = nltk.word_tokenize(text)

    # Stop-word removal followed by lemmatization of the surviving tokens
    lemmatized_tokens = [
        _LEMMATIZER.lemmatize(word)
        for word in tokens
        if word not in _STOP_WORDS
    ]

    # Join lemmatized tokens back into a cleaned text
    return ' '.join(lemmatized_tokens)

# Clean every entry of the 'text' column and store the result in a new
# 'cleaned_text' column next to the raw text.
df['cleaned_text'] = df['text'].map(preprocess_text)

# Show the first five cleaned entries as a quick sanity check.
print(df['cleaned_text'].head(5))

In [None]:
# Display the DataFrame (a notebook renders the value of a cell's last expression).
df

## --> Installing Required modules

In [None]:
# Install the `transformers` and `torch` packages into the notebook runtime
# (the leading `!` runs the line as a shell command).
!pip install transformers torch

In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

## --> Implementing BERT model

In [None]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
model_name = 'bert-base-uncased'
num_labels = 3  # Three classes: positive, negative, neutral

# The tokenizer and the sequence-classification model are loaded from the same
# checkpoint; the classification head is sized for `num_labels` classes.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

#### --> Classifying as Positive, Negative and Neutral

In [None]:
# Human-readable names of the three sentiment classes.
# NOTE(review): presumably the list position is the class index
# (0 = negative, 1 = neutral, 2 = positive) - confirm against the label encoding.
class_labels = ['negative', 'neutral', 'positive']

In [None]:
# Tokenize the cleaned texts and build the label tensor for the classifier.
cleaned_texts = df['cleaned_text'].tolist()
scores = df['score'].tolist()

# Pad every sequence to the longest one in the batch and truncate anything
# beyond the model's maximum length; tensors are returned as PyTorch tensors.
encoded_texts = tokenizer(cleaned_texts, padding=True, truncation=True, return_tensors='pt')
input_ids = encoded_texts['input_ids']
attention_mask = encoded_texts['attention_mask']

# The classifier has three outputs, so targets must be the class indices 0, 1, 2.
# The raw polarity codes cannot be used directly: a target of 4 is out of range
# for a 3-class cross-entropy loss.
# NOTE(review): assumes the 'score' column uses the Sentiment140 coding
# (0 = negative, 2 = neutral, 4 = positive), as the dataset file name suggests -
# confirm against the data.
score_to_class = {0: 0, 2: 1, 4: 2}
unexpected_scores = set(scores) - set(score_to_class)
if unexpected_scores:
    raise ValueError(
        f"Unexpected values in 'score' column: {sorted(unexpected_scores, key=str)}"
    )
labels = torch.tensor([score_to_class[score] for score in scores], dtype=torch.long)

### --> Splitting Train, Validation and Test Datasets

In [None]:
# 80 / 10 / 10 split: hold out 20% of the samples first, then cut that holdout
# in half to obtain the validation and test sets. Both splits use a fixed seed.
train_data, holdout_data, train_labels, holdout_labels = train_test_split(
    input_ids, labels, test_size=0.2, random_state=42
)
val_data, test_data, val_labels, test_labels = train_test_split(
    holdout_data, holdout_labels, test_size=0.5, random_state=42
)

# NOTE(review): only input ids and labels are carried into the datasets; the
# attention mask produced by the tokenizer is not part of the batches.
batch_size = 16
train_dataset = TensorDataset(train_data, train_labels)
val_dataset = TensorDataset(val_data, val_labels)
test_dataset = TensorDataset(test_data, test_labels)

# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

### --> Optimization

In [None]:
# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = optim.AdamW(params=model.parameters(), lr=2e-5)

# Cross-entropy loss for the multi-class classification objective.
criterion = nn.CrossEntropyLoss()

## --> Training the model

In [None]:
# Epoch loop skeleton.
# NOTE(review): the training and validation bodies are elided in this cell - as
# written, each epoch only toggles the model between train and eval mode and
# performs no forward/backward pass or optimizer step.
num_epochs = 5
for epoch in range(num_epochs):
    # Switch to training mode before the (elided) pass over the training batches.
    model.train()
    # ... training loop
    # Validation loop
    # Switch to evaluation mode before the (elided) pass over the validation batches.
    model.eval()
    # ... validation loop

## --> Model Evaluation with Accuracy, Classification Report and Confusion Matrix

In [None]:
# Evaluate the model on the held-out test set: collect predictions and true
# labels batch by batch, then report accuracy, a per-class classification
# report, and the confusion matrix.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

model.eval()
test_predictions = []
test_true_labels = []
with torch.no_grad():
    # Batch-local names are used so the full-dataset `input_ids` / `labels`
    # tensors defined earlier are not overwritten by the last batch.
    for batch_input_ids, batch_labels in test_loader:
        # The batches carry no attention mask, so rebuild it from the ids:
        # every position holding the tokenizer's padding id is masked out.
        batch_attention_mask = (batch_input_ids != tokenizer.pad_token_id).long()
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        predicted = torch.argmax(outputs.logits, dim=1)
        test_predictions.extend(predicted.cpu().tolist())
        test_true_labels.extend(batch_labels.cpu().tolist())

# Calculate accuracy
test_accuracy = accuracy_score(test_true_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy:.4f}')

# Classification report
target_names = ['negative', 'neutral', 'positive']
# Pass the class indices explicitly so the report and the matrix always cover
# all three classes, even when one of them is absent from the test data.
class_ids = list(range(len(target_names)))
print(classification_report(
    test_true_labels,
    test_predictions,
    labels=class_ids,
    target_names=target_names,
    zero_division=0,  # report 0 instead of warning for classes with no samples
))

# Confusion matrix
confusion = confusion_matrix(test_true_labels, test_predictions, labels=class_ids)
print("Confusion Matrix:")
print(confusion)