In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import string
import warnings
warnings.filterwarnings('ignore')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.figure_factory as ff

In [None]:
# Load the dataset from the working directory and preview the first rows.
df=pd.read_csv('Combined Data.csv')
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Rows whose statement is exactly one space character.
df.loc[df['statement'].eq(' ')]

In [None]:
# Rows whose status label is exactly one space character.
df.loc[df['status'].eq(' ')]

In [None]:
# Missing values per column before any filling.
df.isna().sum()

In [None]:
# Rows that are exact duplicates of an earlier row.
df.loc[df.duplicated()]

In [None]:
# Replace every missing value with an empty string.
df = df.fillna('')

In [None]:
# Confirm that no missing values remain after filling.
df.isna().sum()

In [None]:
# Drop the leftover CSV index column. errors='ignore' keeps the cell re-runnable:
# a second execution no longer raises KeyError once the column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Five random rows; no random_state is passed, so the rows differ between runs.
df.sample(5)

In [None]:
# Statements that are empty or whitespace-only. After .str.strip() such values
# become '', so the comparison must be against the empty string (a stripped
# value can never equal ' ').
df[df['statement'].str.strip()=='']

In [None]:
# Working copy of the text, lowercased; the original 'statement' column is left untouched.
df['cleaned_comment']=df['statement'].str.lower()

In [None]:
# URL pattern: scheme followed by a run of URL-legal characters.
# Fixes to the pattern text: the percent-escape alternative had a stray space
# between the two hex digits, and the parentheses were escaped with doubled
# backslashes even though this is a raw string.
url = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# Rows whose lowercased text contains at least one URL.
comment_with_url=df[df['cleaned_comment'].str.contains(url,regex=True)]
comment_with_url

In [None]:
# Replace URLs with a space. Assign the result back instead of calling an
# inplace method on df['cleaned_comment']: that is chained assignment, which
# modifies a temporary (and leaves df unchanged) under pandas Copy-on-Write.
df['cleaned_comment'] = df['cleaned_comment'].replace(url, ' ', regex=True)

In [None]:
# Original text of row label 4786, for comparison with its cleaned version.
df.loc[4786, 'statement']

In [None]:
# Cleaned text of the same row label.
df.loc[4786, 'cleaned_comment']

In [None]:
# Rows whose cleaned text contains a newline character.
df[df['cleaned_comment'].str.contains('\n')]

In [None]:
# Replace newlines with a space. Assigned back rather than using an inplace
# method on the column selection, which is a no-op under pandas Copy-on-Write.
df['cleaned_comment'] = df['cleaned_comment'].replace('\n', ' ', regex=True)

In [None]:
# Re-run the newline filter; an empty result means the replacement took effect.
df[df['cleaned_comment'].str.contains('\n')]

In [None]:
# Spot-check the cleaned text at row label 223.
df.loc[223, 'cleaned_comment']

In [None]:
def preprocess_text(text):
    """Normalize one raw comment into lowercase, single-spaced word tokens.

    Steps, in order: lowercase; delete URLs (http/https/www); delete HTML
    tags; delete ASCII punctuation; turn newlines into spaces; delete any
    word that contains a digit; collapse whitespace runs and trim the ends.

    Args:
        text: The raw comment. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    cleaned = text.lower()

    # Regex deletions that must run while punctuation is still present:
    # URLs first, then HTML tags.
    for pattern in (r'https?://\S+|www\.\S+', r'<.*?>'):
        cleaned = re.sub(pattern, '', cleaned)

    # Punctuation is deleted outright (not replaced by a space), so
    # "123-456" becomes the single token "123456".
    drop_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = cleaned.translate(drop_punctuation)

    cleaned = cleaned.replace("\n", " ")

    # Any token with at least one digit is removed entirely.
    cleaned = re.sub(r'\b\w*\d\w*\b', '', cleaned)

    # The deletions above leave gaps; squeeze them and trim.
    return re.sub(r'\s+', ' ', cleaned).strip()

# Test with a sample that contains a URL, HTML tags, punctuation and digit tokens
sample_text = "Visit https://example.com! New product <b>50% off</b>. Call now: 123-456-7890."
print(preprocess_text(sample_text))

In [None]:
# Run the full cleaning function over every comment.
df['cleaned_comment'] = df['cleaned_comment'].apply(preprocess_text)

In [None]:
# Spot-check the fully cleaned text at row label 822.
df.loc[822, 'cleaned_comment']

In [None]:
# Class balance: number of rows per status label.
fig, ax = plt.subplots(figsize=(12, 7))
sns.countplot(data=df, x='status', ax=ax)
plt.show()

In [None]:
# Number of whitespace-separated tokens in each cleaned comment.
df['word_count'] = df['cleaned_comment'].str.split().str.len()

In [None]:
# displot is a figure-level function: it creates its own figure, so a preceding
# plt.figure(...) only produced an extra empty figure and its figsize was
# ignored. Size the plot through height/aspect instead (8 x 1.5 -> 12x8 inches).
sns.displot(data=df, x='word_count', hue='status', kind='kde', fill=True,
            palette='viridis', height=8, aspect=1.5)
plt.title('Distribution of Word Count by Status', fontsize=16)
plt.xlabel('Word Count', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.show()

In [None]:
# Fetch the NLTK stopword corpus used by the cells below.
nltk.download('stopwords')

In [None]:
# English stopwords as a set for O(1) membership checks.
stop_words = set(stopwords.words('english'))

# New column 'num_stop_words': how many tokens of each comment are stopwords.
df['num_stop_words'] = df['cleaned_comment'].apply(
    lambda comment: sum(1 for token in comment.split() if token in stop_words)
)

In [None]:
# create a distribution plot of 'num_stop_words' column
plt.figure(figsize=(10,6))

sns.histplot(df['num_stop_words'],kde=True)
plt.title('distribution of stop words')
plt.xlabel('number of stop words')
plt.ylabel('frequency')
plt.show()

In [None]:
# Median stopword count per status label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='status', y='num_stop_words', estimator=np.median, ax=ax)
plt.show()

In [None]:
from collections import Counter

# Collect every stopword occurrence across the cleaned comments, in corpus order.
all_stop_words = []
for comment in df['cleaned_comment']:
    all_stop_words.extend(word for word in comment.split() if word in stop_words)

# The 25 most frequent stopwords with their counts.
most_common_stop_words = Counter(all_stop_words).most_common(25)

top_25_df = pd.DataFrame(most_common_stop_words, columns=['stopword', 'count'])
top_25_df.head()

In [None]:
# Horizontal bar chart of the most common stopwords.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=top_25_df, y='stopword', x='count', palette='viridis', ax=ax)
ax.set_title('top 25 most common stopwords')
ax.set_xlabel('count')
ax.set_ylabel('stopword')
plt.show()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def plot_word_count(text):
  """Draw a word cloud for a collection of strings.

  Args:
    text: Iterable of strings; they are joined with single spaces into one
      corpus before the cloud is generated.
  """
  corpus = ' '.join(text)
  cloud = WordCloud(background_color='white', width=800, height=400).generate(corpus)

  plt.figure(figsize=(10,5))
  plt.imshow(cloud, interpolation='bilinear')
  plt.axis('off')  # the image has no meaningful axes
  plt.show()

In [None]:
# Word cloud over all cleaned comments.
plot_word_count(df['cleaned_comment'])

In [None]:
# Model input is the cleaned text; the target is the status label.
x=df['cleaned_comment']
y=df['status']

In [None]:
# 80/20 split with a fixed seed. The BERT cell repeats this call with the same
# arguments on encoded labels, so keep the two calls in sync.
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.2)

In [None]:
# TF-IDF features with library-default settings.
vectorizer=TfidfVectorizer()

In [None]:
# Fit the vocabulary on the training texts only, then reuse it for the test texts.
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Baseline classifiers with default hyperparameters.
# NOTE(review): warnings are silenced globally at the top of the notebook, so a
# solver convergence warning from LogisticRegression would not be visible — confirm it converges.
lr=LogisticRegression()
mnb=MultinomialNB()

In [None]:
# Train logistic regression on the TF-IDF features and predict the held-out set.
lr.fit(x_train_tfidf,y_train)
y_pred=lr.predict(x_test_tfidf)

In [None]:
# Per-class precision/recall/F1 for the most recent y_pred (logistic regression when run in order).
print(classification_report(y_test,y_pred))

In [None]:
# confusion_matrix orders rows/columns by sorted label, while list(set(...))
# has arbitrary order, so the old axis tick labels could be attached to the
# wrong rows/columns. Build one sorted label list and use it for both the
# matrix and the axes.
labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_fig = ff.create_annotated_heatmap(
    z=cm,
    x=labels,   # columns: predicted label
    y=labels,   # rows: true label
    annotation_text=cm,
    colorscale='Viridis'
)
cm_fig.update_layout(title='Confusion Matrix')
cm_fig.show()

In [None]:
# Accuracy of the most recent y_pred (logistic regression when cells run in order).
accuracy_score(y_test,y_pred)

In [None]:
# Train multinomial naive Bayes on the TF-IDF features and predict the held-out set.
mnb.fit(x_train_tfidf,y_train)
y_pred=mnb.predict(x_test_tfidf)
# Confusion Matrix
# Use one sorted label list for both the matrix and the axes: set() order is
# arbitrary and does not match the sorted order confusion_matrix uses.
labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_fig = ff.create_annotated_heatmap(
    z=cm,
    x=labels,   # columns: predicted label
    y=labels,   # rows: true label
    annotation_text=cm,
    colorscale='Viridis'
)
cm_fig.update_layout(title='Confusion Matrix')
cm_fig.show()

In [None]:
# Accuracy of the most recent y_pred (naive Bayes when cells run in order).
accuracy_score(y_test,y_pred)

In [None]:
# Seed the tree so repeated runs give the same model (the split above is
# already seeded with 42).
dt=DecisionTreeClassifier(random_state=42)
dt.fit(x_train_tfidf,y_train)
y_pred=dt.predict(x_test_tfidf)
# Confusion Matrix
# Use one sorted label list for both the matrix and the axes: set() order is
# arbitrary and does not match the sorted order confusion_matrix uses.
labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_fig = ff.create_annotated_heatmap(
    z=cm,
    x=labels,   # columns: predicted label
    y=labels,   # rows: true label
    annotation_text=cm,
    colorscale='Viridis'
)
cm_fig.update_layout(title='Confusion Matrix')
cm_fig.show()

In [None]:
# Accuracy of the most recent y_pred (decision tree when cells run in order).
accuracy_score(y_test,y_pred)

In [None]:
# rf=RandomForestClassifier()
# rf.fit(x_train_tfidf,y_train)
# y_pred=rf.predict(x_test_tfidf)
# # Confusion Matrix
# cm = confusion_matrix(y_test, y_pred)
# cm_fig = ff.create_annotated_heatmap(
#     z=cm,
#     x=list(set(y_test)),
#     y=list(set(y_test)),
#     annotation_text=cm,
#     colorscale='Viridis'
# )
# cm_fig.update_layout(title='Confusion Matrix')
# cm_fig.show()

In [None]:
# accuracy_score(y_test,y_pred)

In [None]:
# Install required libraries for BERT
# !pip install transformers torch

# Import necessary libraries for BERT
import torch
# AdamW is taken from torch.optim: the transformers re-export was deprecated
# and is no longer importable from recent transformers releases.
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
import plotly.figure_factory as ff

In [None]:


# Ensure device is set (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Seed torch's global RNG so random initialisation and DataLoader shuffling
# are repeatable from run to run.
torch.manual_seed(42)

# Step 1: Prepare the data for BERT
# Since BERT requires tokenized input, we use BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the labels (status) into numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_labels = len(label_encoder.classes_)

# Split the data again (same arguments as the earlier split, so the same rows
# land in train/test). NOTE: this rebinds x_train/x_test/y_train/y_test; from
# here on y_train/y_test hold integer codes, not the original string labels.
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, random_state=42, test_size=0.2)

# Tokenize the text data for BERT
def tokenize_data(texts, max_length=128):
    """Tokenize a collection of strings into fixed-length BERT inputs.

    Uses the module-level ``tokenizer``. All texts are encoded in a single
    batched call instead of one deprecated ``encode_plus`` call per row
    followed by ``torch.cat``.

    Args:
        texts: Iterable of strings (e.g. a pandas Series); materialised to a
            list before tokenization.
        max_length: Every sequence is padded or truncated to this many tokens,
            special tokens included.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of PyTorch tensors, one row per
        input text and ``max_length`` columns.
    """
    encoded = tokenizer(
        list(texts),
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length=max_length,    # Maximum length for BERT input
        padding='max_length',     # Pad to max_length
        truncation=True,          # Truncate longer texts
        return_attention_mask=True,
        return_tensors='pt'       # Return PyTorch tensors
    )
    return encoded['input_ids'], encoded['attention_mask']

# Tokenize training and testing data (token ids plus attention masks for each split)
x_train_ids, x_train_masks = tokenize_data(x_train)
x_test_ids, x_test_masks = tokenize_data(x_test)

# Convert the integer-encoded labels to long tensors
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Create TensorDatasets and DataLoaders for batch processing
# Each dataset item is (input_ids, attention_mask, label)
train_dataset = TensorDataset(x_train_ids, x_train_masks, y_train_tensor)
test_dataset = TensorDataset(x_test_ids, x_test_masks, y_test_tensor)

# Shuffle only the training batches; test order is kept fixed
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Step 2: Load pre-trained BERT with a classification head sized to the label
# set, and move it onto the selected device in the same expression
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=False,
    output_attentions=False,
    num_labels=num_labels,  # one output logit per class
).to(device)

# Step 3: Training length and optimizer
epochs = 3
# weight_decay is spelled out as 0.0 so the update rule does not depend on the
# optimizer's default
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

# Step 4: Fine-tune BERT
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits
        loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

# Step 5: Evaluate BERT model
model.eval()
predictions, true_labels = [], []

# No gradients are needed for inference
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        logits = model(input_ids, attention_mask=attention_mask).logits
        # Predicted class = index of the largest logit in each row
        preds = logits.argmax(dim=1).cpu().numpy()

        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

# Step 6: Calculate accuracy and confusion matrix
bert_accuracy = accuracy_score(true_labels, predictions)
print(f"BERT Accuracy: {bert_accuracy:.4f}")

# Confusion Matrix
# Pin the label set to every encoded class so the matrix always has one
# row/column per entry of label_encoder.classes_, even if some class never
# appears in the test labels or predictions (otherwise the matrix shape would
# not match the axis labels below).
cm = confusion_matrix(true_labels, predictions, labels=list(range(num_labels)))
cm_fig = ff.create_annotated_heatmap(
    z=cm,
    x=list(label_encoder.classes_),   # columns: predicted label
    y=list(label_encoder.classes_),   # rows: true label
    annotation_text=cm,
    colorscale='Viridis'
)
cm_fig.update_layout(title='Confusion Matrix for BERT')
cm_fig.show()

# Optional: Compare with previous models
# y_test now holds integer codes (it was rebound by the encoded split above),
# while lr/mnb/dt predict the original string labels. Decode y_test first so
# the classic-model accuracies compare like with like instead of never matching.
y_test_labels = label_encoder.inverse_transform(y_test)
print("Model Performance Comparison:")
print(f"Logistic Regression Accuracy: {accuracy_score(y_test_labels, lr.predict(x_test_tfidf)):.4f}")
print(f"Multinomial Naive Bayes Accuracy: {accuracy_score(y_test_labels, mnb.predict(x_test_tfidf)):.4f}")
print(f"Decision Tree Accuracy: {accuracy_score(y_test_labels, dt.predict(x_test_tfidf)):.4f}")
print(f"BERT Accuracy: {bert_accuracy:.4f}")