In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

# Location of the dataset CSV on the mounted drive
DATA_PATH = '/content/drive/MyDrive/Job/data/copy_sentiment140.csv'

# Load the dataset; header=None makes pandas read the first line as data, not as column names
data = pd.read_csv(DATA_PATH, encoding='latin1', header=None)

# Assign column names
data.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Keep a random 10% of the rows; random_state pins which rows are drawn
fraction = 0.1
data = data.sample(frac=fraction, random_state=42)

# Display the first few rows of the dataset
data.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,target,ids,date,flag,user,text
541200,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs AHHH I HOPE YOUR OK!!!
750,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo..."
766711,0,2300048954,Tue Jun 23 13:40:11 PDT 2009,NO_QUERY,sammydearr,@TiannaChaos i know just family drama. its la...
285055,0,1993474027,Mon Jun 01 10:26:07 PDT 2009,NO_QUERY,Lamb_Leanne,School email won't open and I have geography ...
705995,0,2256550904,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,yogicerdito,upper airways problem


# Preprocessing and Lemmatization

In [None]:
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data.
# Recent NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# where older ones used 'punkt' and 'averaged_perceptron_tagger', so both
# generations are requested. nltk.download() reports an unknown package id and
# returns False instead of raising, so the extra ids are harmless on older NLTK.
for resource in (
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt',
    'punkt_tab',
):
    nltk.download(resource)

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Map a POS tag (as produced by nltk.pos_tag) to a WordNet POS constant
def get_wordnet_pos(tag):
    """Return the WordNet POS constant selected by the leading letter of ``tag``.

    Tags starting with 'J', 'V', 'N' or 'R' yield ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Every other tag falls back to ``wordnet.NOUN``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns
    return wordnet.NOUN

# Advanced preprocessing function
# The patterns are compiled once instead of on every call.
# 'http\S+' already covers 'https...' links, so no separate https alternative is needed.
_URL_PATTERN = re.compile(r"http\S+|www\S+")
# Drops whole @mentions, but only the '#' character of a hashtag (the tag word is kept).
_MENTION_OR_HASH_PATTERN = re.compile(r'\@\w+|\#')
# Anything that is neither a word character nor whitespace counts as punctuation.
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')


def preprocess_text_advanced(text):
    """Clean one piece of text and return its lemmatized, stopword-free tokens.

    Steps, in order: strip URLs, strip @mentions and '#' characters, lowercase,
    strip punctuation, tokenize with ``nltk.word_tokenize``, POS-tag with
    ``nltk.pos_tag``, drop tokens found in ``stop_words``, and lemmatize the
    remaining tokens using the POS returned by ``get_wordnet_pos``.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example a NaN
            or None cell from a DataFrame) is treated as empty text.

    Returns:
        A list of lemmatized tokens; ``[]`` for empty or non-string input.
    """
    # Missing values would otherwise make the regex substitutions raise TypeError
    if not isinstance(text, str):
        return []

    text = _URL_PATTERN.sub('', text)
    text = _MENTION_OR_HASH_PATTERN.sub('', text)
    text = _PUNCTUATION_PATTERN.sub('', text.lower())

    # Tokenize and POS tag
    words_pos = nltk.pos_tag(nltk.word_tokenize(text))

    # Stopwords are filtered on the surface form, before lemmatization
    return [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in words_pos
        if word not in stop_words
    ]

# Run the advanced preprocessing over every entry of the text column and keep
# the resulting token lists in a new column
data['cleaned_text_advanced'] = data['text'].map(preprocess_text_advanced)

# Binarize the target: 0 stays 0, every other value (4 in the raw labels) becomes 1
data['target'] = data['target'].ne(0).astype('int64')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Train Word2Vec Model

In [None]:
from gensim.models import Word2Vec
import numpy as np

# Collect the token lists produced by preprocessing, one list per row
sentences = list(data['cleaned_text_advanced'])

# Word2Vec hyper-parameters gathered in one place
word2vec_params = dict(vector_size=200, window=10, min_count=2, workers=4, epochs=10)

# NOTE(review): the embeddings are fit on every row of `data`, including rows
# that the later train/test split holds out for evaluation.
# NOTE(review): with workers=4, training may not be bit-for-bit repeatable
# between runs; confirm against the gensim documentation.
word2vec_model = Word2Vec(sentences, **word2vec_params)

# Turn one tokenized sentence into a single vector by averaging its word vectors
def vectorize_sentence(sentence, model):
    """Return the element-wise mean of the vectors of the known words in ``sentence``.

    Args:
        sentence: Iterable of tokens.
        model: Object exposing ``wv`` (indexable by token, with a
            ``key_to_index`` membership container) and ``vector_size``.

    Returns:
        The mean vector of all tokens present in ``model.wv.key_to_index``, or a
        zero vector of length ``model.vector_size`` when no token is known.
    """
    vocabulary = model.wv.key_to_index
    known_vectors = [model.wv[token] for token in sentence if token in vocabulary]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # No in-vocabulary token: fall back to an all-zero vector
    return np.zeros(model.vector_size)

# Build the feature matrix: one averaged Word2Vec vector per sentence
X = np.array(list(map(lambda tokens: vectorize_sentence(tokens, word2vec_model), sentences)))


# Split Data into Training and Test Sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    data['target'],
    random_state=42,
    test_size=0.2,
)


# Training and Evaluating Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Create the class-weighted logistic regression and fit it on the training
# embeddings in one step (fit returns the fitted estimator)
model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred = model.predict(X_test)

# Report accuracy, per-class metrics and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy: {accuracy}")
report = classification_report(y_test, y_pred)
print("Final Model Classification Report:\n", report)
matrix = confusion_matrix(y_test, y_pred)
print("Final Model Confusion Matrix:\n", matrix)


Final Model Accuracy: 0.74634375
Final Model Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.73      0.74     15878
           1       0.74      0.76      0.75     16122

    accuracy                           0.75     32000
   macro avg       0.75      0.75      0.75     32000
weighted avg       0.75      0.75      0.75     32000

Final Model Confusion Matrix:
 [[11571  4307]
 [ 3810 12312]]
