### Imports and Setups
Note: `import emoji` is intentionally left out. That module is used to remove emojis from text, but emojis are already stripped from the Sentiment140 dataset, so it is not needed in this case.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import pickle
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

### Loading Dataset
I chose the Sentiment140 tweet dataset because it contains 1.6 million labeled tweets, which makes it a good example for testing machine learning skills on a large amount of data.

In [None]:
# Column labels are passed explicitly via `names`, so pandas reads every row
# of the file as data rather than treating the first row as a header.
column_names = ['sentiment', 'ids', 'date', 'flag', 'user', 'text']
df = pd.read_csv("database//tweets.csv", names = column_names)

# Report the (rows, columns) size, then preview the first records.
print(df.shape)
df.head()

### Data Visualization
In the `sentiment` column, 0 marks a negative tweet and 4 marks a positive tweet.

In [None]:
# value_counts() orders its result by frequency, not by label, while the
# colours below are assigned by bar position. Sorting by label pins the bars
# to 0 (negative) then 4 (positive) so red/green always match the labels.
df['sentiment'].value_counts().sort_index().plot(kind = 'bar', color = ['red','green'])
plt.title("Sentiment Distribution (Raw Data)")
plt.xlabel("Sentiment (0 = negative, 4 = positive)")
plt.ylabel("Number of tweets")
plt.show()

In [None]:
# Character count of each raw tweet, stored next to the text.
df['tweet_length'] = df['text'].map(len)

# Histogram of those lengths, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.hist(df['tweet_length'], bins = 50, color = 'skyblue', edgecolor = 'black')
ax.set_title("Tweet Length Distribution (Raw Data)")
ax.set_xlabel("Length (characters)")
ax.set_ylabel("Number of Tweets")
plt.show()

### Data Cleaning

In [None]:
def cleaner(string):
    """Normalize a raw tweet before lemmatization and vectorization.

    Steps, in order: remove URLs, replace @mentions with a placeholder,
    drop '#' characters (the hashtag word itself is kept), collapse runs of
    whitespace into single spaces, then lowercase and trim.

    Args:
        string: Raw tweet text.

    Returns:
        The cleaned text. Mentions come out as "[user]" because lowercasing
        runs after the "[USER]" placeholder has been inserted.
    """
    # \b keeps the URL patterns from firing inside ordinary words (an
    # unanchored "www\S+" turns "awwww" into "a"), and "www" must be followed
    # by a dot. IGNORECASE is needed because lowercasing only happens later.
    string = re.sub(r"\bhttp\S+|\bwww\.\S+", "", string, flags = re.IGNORECASE)
    string = re.sub(r"@\w+", "[USER]", string)
    string = re.sub(r"#", "", string)
    string = re.sub(r"\s+", " ", string)
    string = string.lower().strip()
    return string

# Run the cleaner over every tweet and keep the result in a new column.
df["processed_text"] = df["text"].map(cleaner)

# Side-by-side preview of raw vs. cleaned text.
df[["text", "processed_text"]].head()

In [None]:
# One shared lemmatizer instance, reused for every tweet.
lemmatizer = WordNetLemmatizer()

def lemmatize(string):
    """Lemmatize each whitespace-separated token of `string`.

    Args:
        string: Text to process; it is split on whitespace.

    Returns:
        The lemmatized tokens joined back together with single spaces.
    """
    return ' '.join(map(lemmatizer.lemmatize, string.split()))

# Lemmatize the cleaned text and keep it as its own column.
df["lemmatized_text"] = df["processed_text"].map(lemmatize)

# Side-by-side preview of raw vs. lemmatized text.
df[["text", "lemmatized_text"]].head()

### Train Test Data Split
The data is split into training and test sets in a 9:1 ratio, stratified by sentiment label.

In [None]:
# Features: the lemmatized tweet text. Labels: the sentiment column as ints.
x, y = df["lemmatized_text"], df["sentiment"].astype(int)

In [None]:
# Hold out 10% of the rows for testing; stratifying on y keeps the label
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.1,
    stratify = y,
    random_state = 10,
)

# Print the size of each part.
for label, part in (
    ("X Train: ", x_train),
    ("Y Train: ", y_train),
    ("X Test: ", x_test),
    ("Y Test: ", y_test),
):
    print(label, part.shape)

### Logistic Regression Model Training
A limitation of this model is that it does not understand sarcasm or context-dependent text, because it relies on TfidfVectorizer features.

In [None]:
# TF-IDF over unigrams and bigrams, with English stop words removed and the
# vocabulary capped at 20,000 features.
vectorizer = TfidfVectorizer(
    ngram_range = (1,2),
    max_features = 20000,
    stop_words = 'english',
)

# The vocabulary is fitted on the training text only; the test text is then
# transformed with that same fitted vectorizer.
x_train_int = vectorizer.fit_transform(x_train)
x_test_int = vectorizer.transform(x_test)

In [None]:
# Logistic regression with C = 0.5 and up to 500 solver iterations.
model = LogisticRegression(C = 0.5, max_iter = 500)

# Fit on the vectorized training data; the fitted estimator is the cell output.
model.fit(x_train_int, y_train)

### Evaluating Model
The model was tuned by adjusting the regularization hyperparameter `C`, the maximum number of iterations, and the vectorizer's maximum number of features. This brought the model's test accuracy to 78.39%, which is very accurate for such a big dataset.

In [None]:
# Predict labels for the held-out test set.
y_test_pred = model.predict(x_test_int)

# Overall accuracy first, then the per-class classification report.
test_accuracy = accuracy_score(y_test, y_test_pred)
print(test_accuracy)
test_report = classification_report(y_test, y_test_pred)
print(test_report)

# Pickle the fitted model and the fitted vectorizer, each to its own file.
artifacts = (
    ('sentiment_model.pkl', model),
    ('vectorizer.pkl', vectorizer),
)
for filename, artifact in artifacts:
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

print('Model and vectorizer saved successfully!')