## Import needed labraries

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pandas as pd

#### Downloading NLTK (Natural Language Toolkit) data
#### unzipping it and setting data path

In [None]:
nltk.download('wordnet', "/kaggle/working/nltk_data/")
nltk.download('omw-1.4', "/kaggle/working/nltk_data/")
! unzip /kaggle/working/nltk_data/corpora/wordnet.zip -d /kaggle/working/nltk_data/corpora
! unzip /kaggle/working/nltk_data/corpora/omw-1.4.zip -d /kaggle/working/nltk_data/corpora
nltk.data.path.append("/kaggle/working/nltk_data/")

## Reading Data

## Data Preprocessing

In [3]:

training_data =pd.read_csv("twitter_training.csv")
test_data=pd.read_csv("twitter_validation.csv")


In [4]:

test_data.columns = ['Header1', 'company','labels','text']
training_data.columns = ['Header1', 'company','labels','text']
training_data.drop(columns=["Header1","company"],inplace=True)
test_data.drop(columns=["Header1","company"],inplace=True)


In [5]:
print("training:\n",training_data)
print("\n\ntest:\n",test_data)

training:
          labels                                               text
0      Positive  I am coming to the borders and I will kill you...
1      Positive  im getting on borderlands and i will kill you ...
2      Positive  im coming on borderlands and i will murder you...
3      Positive  im getting on borderlands 2 and i will murder ...
4      Positive  im getting into borderlands and i can murder y...
...         ...                                                ...
74676  Positive  Just realized that the Windows partition of my...
74677  Positive  Just realized that my Mac window partition is ...
74678  Positive  Just realized the windows partition of my Mac ...
74679  Positive  Just realized between the windows partition of...
74680  Positive  Just like the windows partition of my Mac is l...

[74681 rows x 2 columns]


test:
          labels                                               text
0       Neutral  BBC News - Amazon boss Jeff Bezos rejects clai...
1      Negative 

In [6]:
sentiment=pd.concat([training_data,test_data],ignore_index=True)
sentiment.dropna(inplace=True)
sentiment.drop_duplicates(inplace=True) 

In [7]:
sentiment

Unnamed: 0,labels,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...
...,...,...
75668,Neutral,♥️ Suikoden 2\n1️⃣ Alex Kidd in Miracle World\...
75669,Positive,Thank you to Matching funds Home Depot RW paym...
75671,Neutral,Late night stream with the boys! Come watch so...
75675,Irrelevant,⭐️ Toronto is the arts and culture capital of ...


In [19]:
def process_text(text):
    text = re.sub(r'\s+', ' ', text, flags=re.I) # Remove extra white space from text

    text = re.sub(r'\W', ' ', str(text)) # Remove all the special characters from text

    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text) # Remove all single characters from text

    text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove any character that isn't alphabetical

    text = text.lower()

    words = word_tokenize(text) # tokenizes the text into words

    lemmatizer = WordNetLemmatizer() #  used to reduce words to their dictionary (base) form
    words = [lemmatizer.lemmatize(word) for word in words]

    stop_words = set(stopwords.words("english"))
    Words = [word for word in words if word not in stop_words] #removes stopwords from the list of words.

    Words = [word for word in Words if len(word) >= 3] # removes words with a len less than or equal to 3 characters.

    indices = np.unique(Words, return_index=True)[1] # gets the indices of unique words in the list of words.
    
    # to create a cleaned list of words by sorting the indices and retrieving corresponding words from the list:
    cleaned_text = np.array(Words)[np.sort(indices)].tolist()

    return cleaned_text

In [20]:
x=sentiment.drop('labels',axis=1)
y=sentiment.labels

In [21]:
print('x:\n ',x)
print('\n\ny:\n ',y)

x:
                                                      text
0      I am coming to the borders and I will kill you...
1      im getting on borderlands and i will kill you ...
2      im coming on borderlands and i will murder you...
3      im getting on borderlands 2 and i will murder ...
4      im getting into borderlands and i can murder y...
...                                                  ...
75668  ♥️ Suikoden 2\n1️⃣ Alex Kidd in Miracle World\...
75669  Thank you to Matching funds Home Depot RW paym...
75671  Late night stream with the boys! Come watch so...
75675  ⭐️ Toronto is the arts and culture capital of ...
75676  tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...

[70251 rows x 1 columns]


y:
  0          Positive
1          Positive
2          Positive
3          Positive
4          Positive
            ...    
75668       Neutral
75669      Positive
75671       Neutral
75675    Irrelevant
75676    Irrelevant
Name: labels, Length: 70251, dtype: object


### Training-Building model

In [22]:
cleaned_text = [process_text(text) for text in list(x['text'])]

# Convert text data into numerical features using CountVectorizer: 
cleaned_text_str = [' '.join(text) for text in cleaned_text]  # joins each list of words into a single string.

#initializes a CountVectorizer object,to convert a collection of text documents to a matrix of token counts.
vectorizer = CountVectorizer(lowercase=False)  
X = vectorizer.fit_transform(cleaned_text_str)

# Split the data into training and testing sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Naive Bayes classifier:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predict sentiment on the test set:
y_pred = nb_classifier.predict(X_test)

In [None]:
cleaned_text_str

In [None]:
cleaned_text

In [15]:
X

<70251x25302 sparse matrix of type '<class 'numpy.int64'>'
	with 620984 stored elements in Compressed Sparse Row format>

In [12]:
## Evaluate the model

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [27]:
# Assuming you have already preprocessed the specific text using process_text function
processed_specific_text = process_text("that is very good")

# Transform the preprocessed text into numerical features
specific_text_features = vectorizer.transform(processed_specific_text)  # Pass as a list of tokens

# Use the trained Naive Bayes classifier to predict sentiment
predicted_sentiment = nb_classifier.predict(specific_text_features)

# Output the predicted sentiment
print("Predicted Sentiment:", predicted_sentiment)


Predicted Sentiment: ['Positive']
