In [1]:
import re
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Download necessary NLTK data
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')

# Define a function for preprocessing
def preprocess_tweet(tweet):
    # Convert to lowercase
    tweet = tweet.lower()
    
    # Remove URLs
    tweet = re.sub(r'https?://\S+|www\.\S+', '', tweet)
    
    # Remove special characters, numbers, and punctuations, except for @ and #
    tweet = re.sub(r'[^a-zA-Z#@]', ' ', tweet)
    
    # Tokenize the tweet
    tokens = word_tokenize(tweet)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Rejoin tokens into a single string
    processed_tweet = ' '.join(tokens)
    
    return processed_tweet

# Load data
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = pd.read_csv("twitter_new.csv", encoding="ISO-8859-1", names=column_names)

# Limit the dataset for faster testing (you can remove this for the entire dataset)
#df = df.sample(frac=1, random_state=42).head(500000)

# Preprocess tweets
df['text'] = df['text'].apply(preprocess_tweet)

# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=500000)
X = tfidf_vectorizer.fit_transform(df['text'])

# Label encoding for the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['target'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression Classifier
lr_classifier = LogisticRegression()
lr_classifier.fit(X_train, y_train)
y_pred_lr = lr_classifier.predict(X_test)

# Save the trained Logistic Regression Classifier and TF-IDF vectorizer using joblib
joblib.dump(lr_classifier, "logistic_classifier.joblib")
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.joblib")

# Evaluate the Logistic Regression Classifier
accuracy = accuracy_score(y_test, y_pred_lr)
report = classification_report(y_test, y_pred_lr, target_names=['Negative', 'Positive'])
conf_matrix = confusion_matrix(y_test, y_pred_lr)

print("Logistic Regression Classifier Evaluation:")
print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')
print(f'Confusion Matrix:\n{conf_matrix}')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Classifier Evaluation:
Accuracy: 0.78133125
Classification Report:
              precision    recall  f1-score   support

    Negative       0.80      0.76      0.78    159494
    Positive       0.77      0.81      0.79    160506

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.78    320000
weighted avg       0.78      0.78      0.78    320000

Confusion Matrix:
[[120588  38906]
 [ 31068 129438]]
