#IMPORTING THE LIBRARIES

In [103]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [104]:
tweets_df= pd.read_csv('/content/tweets.csv')

In [105]:
tweets_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,5894
1,2026


#DATA PREPROCESSING

In [106]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [107]:
# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [108]:
# Preprocessing function
def preprocess_tweet(tweet):
    # Convert to lowercase
    tweet = tweet.lower()
    # Remove URLs
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet)
    # Remove mentions and hashtags
    tweet = re.sub(r"@\w+|#\w+", '', tweet)
    # Remove special characters, numbers, and punctuation
    tweet = re.sub(r"[^a-zA-Z\s]", '', tweet)
    # Tokenize the tweet
    words = word_tokenize(tweet)
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words]
    # Rejoin the words into a single string
    return " ".join(words)

In [109]:
nltk.download('punkt_tab')
tweets_df['cleaned_tweet'] = tweets_df['tweet'].apply(preprocess_tweet)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [110]:
tweets_df.head()

Unnamed: 0,id,label,tweet,cleaned_tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,test
1,2,0,Finally a transparant silicon case ^^ Thanks t...,finally transparant silicon case thanks uncle
2,3,0,We love this! Would you go? #talk #makememorie...,love would go
3,4,0,I'm wired I know I'm George I was made that wa...,im wired know im george made way
4,5,1,What amazing service! Apple won't even talk to...,amazing service apple wont even talk question ...


In [111]:
#extracting features using TF-IDF

In [112]:
vectorizer = TfidfVectorizer()
x=vectorizer.fit_transform(tweets_df['cleaned_tweet']).toarray()

In [113]:
y=tweets_df['label']

#SPLITTING THE DATASET

In [114]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

#TRAINING THE MODEL

In [115]:
model = LogisticRegression()
model.fit(x_train,y_train)

#EVALUATING THE MODEL

In [116]:
y_pred = model.predict(x_test)

In [117]:
accuracy=accuracy_score(y_test,y_pred)

In [118]:
report = classification_report(y_test,y_pred)

In [119]:
print("Accuracy:",accuracy)

Accuracy: 0.8308080808080808


# SENTIMENT ANALYSIS USING SVM

In [120]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [121]:
svm_model = make_pipeline(StandardScaler(with_mean=False),SVC(kernel='linear',C=1.0,random_state=42))

In [122]:
svm_model.fit(x_train, y_train)

#EVALUATING THE MODEL



In [123]:
y_pred = svm_model.predict(x_test)

In [124]:
accuracy=accuracy_score(y_test,y_pred)

In [125]:
report = classification_report(y_test,y_pred)

In [126]:
print("Accuracy:",accuracy)

Accuracy: 0.7986111111111112


In [127]:
print("Classification Report:\n",report)

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.94      0.87      1152
           1       0.73      0.42      0.53       432

    accuracy                           0.80      1584
   macro avg       0.77      0.68      0.70      1584
weighted avg       0.79      0.80      0.78      1584

