### Part 1: Text Collection and Loading
Tweets Dataset: https://www.kaggle.com/datasets/kazanova/sentiment140

In [1]:
import pandas as pd

# Read the tweets CSV (expected next to the notebook) into a DataFrame
dataset_path = "Dataset.csv"
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the first 15 entries of the "Tweet" column
print(df["Tweet"].iloc[:15])


0     @switchfoot http://twitpic.com/2y1zl - Awww, t...
1     is upset that he can't update his Facebook by ...
2     @Kenichan I dived many times for the ball. Man...
3       my whole body feels itchy and like its on fire 
4     @nationwideclass no, it's not behaving at all....
5                         @Kwesidei not the whole crew 
6                                           Need a hug 
7     @LOLTrish hey  long time no see! Yes.. Rains a...
8                  @Tatiana_K nope they didn't have it 
9                             @twittera que me muera ? 
10          spring break in plain city... it's snowing 
11                           I just re-pierced my ears 
12    @caregiving I couldn't bear to watch it.  And ...
13    @octolinz16 It it counts, idk why I did either...
14    @smarrison i would've been the first, but i di...
Name: Tweet, dtype: object


### Text Preprocessing

In [2]:
import nltk
from nltk.corpus import gutenberg

In [4]:
from nltk.tokenize import word_tokenize, sent_tokenize

def tokenize(text):
    """
    Tokenizes the input text into words.

    Args:
    text (str): The text to be tokenized.

    Returns:
    list: A list of the tokens (words and punctuation marks) found in the
    text, in order of appearance.
    """
    # word_tokenize already splits the text into sentences internally before
    # splitting each sentence into tokens, so a separate sent_tokenize pass
    # (whose result was never used) only added work per tweet.
    return word_tokenize(text)

In [5]:
from nltk.corpus import stopwords

# Cache for the default English stop word set, filled on first use so the
# corpus is read once per kernel session rather than once per tweet.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(words, stop_words=None):
    """
    Removes stop words from a list of words.

    Matching is case-insensitive: each word is lower-cased before it is
    compared against the stop word collection.

    Input:
    - words (list): List of words from which stop words should be removed.
    - stop_words (collection of str, optional): Lower-case stop words to
      filter out. Defaults to NLTK's English stop word list.

    Output:
    - filtered_words (list): List of words with stop words removed. The
      original casing and order of the remaining words are preserved.
    """
    global _ENGLISH_STOP_WORDS

    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            # Build the set only once; membership tests on a set are O(1)
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS

    # Keep only the words whose lower-cased form is not a stop word
    filtered_words = [word for word in words if word.lower() not in stop_words]

    return filtered_words


In [6]:
from nltk.stem import PorterStemmer

def stem(words):
    """
    Reduce every word in a list to its stem using the Porter algorithm.

    Args:
    words (list): The words to stem.

    Returns:
    list: The stemmed words, in the same order as the input.
    """
    porter = PorterStemmer()
    # Map the stemmer over the input and materialize the result as a list
    return list(map(porter.stem, words))

In [23]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Download the NLTK resources used by the preprocessing functions in this
# notebook, so that it also runs on a fresh environment.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
nltk.download('punkt')                       # tokenizer models for word_tokenize / sent_tokenize
nltk.download('stopwords')                   # word lists for stopwords.words('english')
nltk.download('averaged_perceptron_tagger')  # model used by pos_tag
nltk.download('wordnet')                     # database used by WordNetLemmatizer

# Function to convert NLTK POS tags to WordNet POS tags
def get_wordnet_pos(nltk_pos_tag):
    """
    Translate an NLTK POS tag into the matching WordNet POS constant.

    Args:
    nltk_pos_tag (str): POS tag as produced by nltk.pos_tag.

    Returns:
    The WordNet POS constant for adjectives ('J...'), verbs ('V...'),
    nouns ('N...') or adverbs ('R...'); any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if nltk_pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognized tags are treated as nouns
    return wordnet.NOUN

def lemmatize(words):
    """
    Reduce every word in a list to its base form with the WordNet lemmatizer.

    The words are POS-tagged together first, and each word is lemmatized
    using the WordNet POS derived from its tag.

    Args:
    words (list): The words to lemmatize.

    Returns:
    list: The lemmatized words, in the same order as the input.
    """
    wnl = WordNetLemmatizer()

    result = []
    # pos_tag yields (word, tag) pairs for the whole list at once
    for token, tag in pos_tag(words):
        result.append(wnl.lemmatize(token, get_wordnet_pos(tag)))

    return result


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PMLS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
def preprocess_tweet(tweet):
    """
    Applies tokenization, stop word removal and lemmatization to a tweet.

    Input:
    - tweet (str): The tweet text to be preprocessed.

    Output:
    - preprocessed (str): The remaining lemmatized tokens joined by single
      spaces.
    """
    # Tokenization
    words = tokenize(tweet)

    # Stop Word Removal
    words_no_stopwords = remove_stopwords(words)

    # Lemmatization
    # Stemming is not applied here: its result was previously computed but
    # never used in the returned text, so the call only cost time per tweet.
    lemmatized_words = lemmatize(words_no_stopwords)

    # Return after all preprocessing
    return ' '.join(lemmatized_words)

# Run the preprocessing pipeline on the first 100 tweets and print the result
# (alternative kept for reference: store the result in a new column)
# df['Preprocessed'] = df['Tweet'][:100].apply(preprocess_tweet)
print(df['Tweet'].head(100).apply(preprocess_tweet))

# Check how a single word is lemmatized when tagged in isolation
print(lemmatize(['behaving']))

# print(df.head())


0     @ switchfoot http : //twitpic.com/2y1zl - Awww...
1     upset ca n't update Facebook texting ... might...
2     @ Kenichan dive many time ball . Managed save ...
3                       whole body feel itchy like fire
4     @ nationwideclass , 's behaving . 'm mad . ? c...
                            ...                        
95    Strider sick little puppy http : //apps.facebo...
96    rylee , grace ... wana go steve 's party ? ? S...
97    hey , actually one bracket pool ! bad n't one ...
98                   @ stark n't follow , either work !
99    bad nite favorite team : Astros Spartans lose ...
Name: Tweet, Length: 100, dtype: object
['behaving']
