# Twitter Sentiment analysis Project

In [3]:
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet, stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.pipeline import Pipeline

# Ensure NLTK resources are downloaded
print("Downloading NLTK resources...")
nltk.download('punkt')           # Tokenizer
nltk.download('stopwords')       # Stopwords corpus
nltk.download('wordnet')         # WordNet lemmatizer
nltk.download('averaged_perceptron_tagger')  # POS tagger
nltk.download('omw-eng')         # Open Multilingual WordNet


# Download the POS tagger with the correct name
nltk.download('averaged_perceptron_tagger')
# OR try downloading it with '_eng' suffix
nltk.download('averaged_perceptron_tagger_eng', quiet=False)


print("NLTK setup complete!")

Downloading NLTK resources...
NLTK setup complete!


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Error loading omw-eng: Package 'omw-eng' not found in
[nltk_data]     index
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downlo

In [4]:
DATASET_COLUMNS = ['target','ids','date','flag','user','text']
DATASET_ENCODING = "ISO-8859-1"
df = pd.read_csv("D:\Robotics\Project Database\Sentiment analysis Data\data.1600000.processed.noemoticon.csv",encoding= DATASET_ENCODING, names=DATASET_COLUMNS)

In [5]:

df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
print(df['target'].value_counts())

target
0    800000
4    800000
Name: count, dtype: int64


In [7]:
df['target']=df['target'].replace(4,1)
print(df['target'].value_counts())

target
0    800000
1    800000
Name: count, dtype: int64


# Data cleaning --> Text Preparing for the training and testing 

- ### Steps in Text Cleaning for the project 
1. Remove urls
2. Replave mentions with 'User'
3. Remove hashtags but keep the text 
4. Remove digits
5. Remove extra whitespace
6. Strip Leading and trailing white spaces 
7. Remove stopwords 
8. Tokenize the text
9. Lemmatize the text 




In [8]:
# Creating a function for cleaning the data 

def clean_text(text):

    stopwordlist = [
        'a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
        'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before',
        'being', 'below', 'between', 'both', 'by', 'can', 'd', 'did', 'do',
        'does', 'doing', 'down', 'during', 'each', 'few', 'for', 'from',
        'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
        'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
        'into', 'is', 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma',
        'me', 'more', 'most', 'my', 'myself', 'needn', 'no', 'nor', 'now',
        'o', 'of', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves',
        'out', 'own', 're', 's', 'same', 'she', "shes", 'should', "shouldve", 'so', 'some', 'such',
        't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
        'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
        'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was',
        'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
        'why', 'will', 'with', 'won', 'y', 'you', "youd", "youll", "youre",
        "youve", 'your', 'yours', 'yourself', 'yourselves'
    ]


    # Function to get NLTK POS tag to WordNEt POS tag

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    # Convert text to lower cases 
    text = text.lower()
   
    

    # Remove Leading and trailing space
    text = text.strip(" ")
   

    # Removing the url 
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))',' ',text)
   


    # Replace @mentions with 'USER'
    text = re.sub(r'@[\S]+','USER',text)
    

    # Remove hashtags but keep the text 
    text = re.sub(r'#(\S+)',' ',text)

    # Remove digits 
    text= re.sub(r'\d+','',text)

    # Remove Extra whitespace 
    text = re.sub(r'\s+',' ',text)

    # Remove stopwrods 
    text = " ".join([word for word in text.split() if word not in stopwordlist])

    # Tokenize text 
    tokenizer = RegexpTokenizer(r'\w+\[^\w\s]')
    tokens = tokenizer.tokenize(text)


    # Part of speecht
    pos_tags = nltk.pos_tag(tokens)


    # Lemmatize the text 
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token ,get_wordnet_pos(tag)) for token , tag in pos_tags]

    return " ".join(lemmatized_tokens)
    
    
     


    



In [9]:
df['target']=df['target'].replace(4,1)

In [10]:
df['text']= df['text'].apply(clean_text)

In [18]:
df.columns

Index(['target', 'ids', 'date', 'flag', 'user', 'text'], dtype='object')

## Splitting the data into training and testing data 

### First will will split the data to features and target 

In [23]:
feature = ["ids","date","flag","user","text"]

In [14]:
target = 'target'

In [24]:
# Features 
x = df[feature]

In [25]:
x.head()

Unnamed: 0,ids,date,flag,user,text
0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,
1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,
2,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,
3,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,
4,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,


In [19]:
y = df[target]
y.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [None]:
x_train , x_test, y_train , y_test = train_test_split(x,y,test_size=0.2,random_state=42)

### Term frequency and inverse document frequency 

- used to evaluate the importance of the term wrt to whole document 

- Term Frequency = No. of times a specific data / word appear in the document 
- inverse document frequency = Measures how imp is that data/word by considering how rare or common that data / word is 

- Corpus = [
    "Data analysis with TF-IDF is powerful.",

    "TF-IDF helps in text preprocessing .",
    
    "Text preprocessing and data analysis are crucial."
]

In [27]:
# initialize the TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=50000,ngram_range=(1,20))

# Fit and transform the training data
x_train_vect = vectorizer.fit_transform(x_train)

# Transform the testing data 
x_test_vect = vectorizer.transform(x_test)

## Model Selection 

- **We are using Logistic regression Model**