In [None]:
from google.colab import files

# Open Colab's upload widget; the result is keyed by uploaded filename.
uploaded = files.upload()

# Report the name and size of every uploaded file.
# The loop variable stays named `fn` because a later cell reads it.
for fn, payload in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(payload)))

Saving Tweets.zip to Tweets.zip
User uploaded file "Tweets.zip" with length 3608147 bytes


In [None]:
from google.colab import files

# Open Colab's upload widget; the result is keyed by uploaded filename.
uploaded = files.upload()

# Report the name and size of every uploaded file.
# The loop variable stays named `fn` because the loading cell reads it.
for fn, payload in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(payload)))

Saving Tweets.xlsx to Tweets.xlsx
User uploaded file "Tweets.xlsx" with length 5211226 bytes


Step 2: Load the Excel Data into a Pandas DataFrame

In [None]:
import pandas as pd

# Choose the workbook from the upload result explicitly. The original relied on
# `fn`, the loop variable left over from the upload cell, which can hold a stale
# name (e.g. an earlier non-Excel upload) when the latest upload was empty.
EXCEL_SUFFIXES = ('.xlsx', '.xlsm', '.xlsb', '.xls', '.ods')
excel_names = [name for name in uploaded if name.lower().endswith(EXCEL_SUFFIXES)]
if not excel_names:
    raise ValueError(
        'No Excel file found among the uploaded files: {!r}'.format(list(uploaded)))

# Keep `fn` bound to the chosen file; when several workbooks were uploaded the
# last one wins, as it did with the leaked loop variable.
fn = excel_names[-1]

# sheet_name=0 is the pandas default and selects the first sheet in the workbook.
df = pd.read_excel(fn, sheet_name=0)

Step 3: Preprocessing Steps

In [None]:
# Normalise the tweet text in a single chained pass:
#   1. lowercase every string;
#   2. drop non-ASCII characters by encoding with errors='ignore' and decoding back.
df['text'] = (
    df['text']
    .str.lower()
    .str.encode('ascii', 'ignore')
    .str.decode('ascii')
)



In [None]:
#Handling NaN Values

import numpy as np

# Replace missing entries in the 'text' column with an empty string so the
# per-row string handling in the following cells never receives NaN.
df['text'] = df['text'].fillna('')


In [None]:
#Remove Stopwords using NLTK

import nltk
# Fetch NLTK's stopword corpus; the next cell loads its English word list.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
# Build the English stopword lookup once; a set keeps membership tests cheap.
stop_words = set(stopwords.words('english'))


def _drop_stopwords(sentence):
    """Return `sentence` with every whitespace-separated stopword removed."""
    kept = []
    for word in sentence.split():
        # Compare case-insensitively against the stopword set.
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return ' '.join(kept)


df['text'] = df['text'].apply(_drop_stopwords)


In [None]:
#Remove URLs, '@' Usernames, Hashtag Symbols, and Any Remaining Non-Letter Characters

import re

def clean_text(text):
    """Strip Twitter-specific noise from a single tweet.

    Removes, in order: URLs starting with ``http``, ``@mentions``, the ``#``
    symbol (the hashtag word itself is kept), and every remaining character
    that is not an ASCII letter or whitespace.

    Args:
        text (str): Tweet text to clean.

    Returns:
        str: The cleaned text. Whitespace is left as-is, so removed items may
        leave consecutive spaces behind.
    """
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove @mentions. Handles may contain underscores; without '_' in the
    # class, '@john_doe' would only lose '@john' and leave 'doe' in the text.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Remove the hashtag marker but keep the tagged word
    text = re.sub(r'#', '', text)
    # Remove everything that is not a letter or whitespace (digits, punctuation, ...)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    return text

# Run clean_text over every tweet, element by element.
df['text'] = df['text'].map(clean_text)

In [None]:
#Remove Duplicate Entries

# Keep the first row for each distinct 'text' value and drop later repeats.
# inplace=True modifies `df` directly, so no reassignment is needed.
df.drop_duplicates(subset='text', keep='first', inplace=True)

In [None]:
#Tokenize using NLTK

import nltk
from nltk.tokenize import word_tokenize

# word_tokenize depends on the Punkt tokenizer data. Recent NLTK releases load
# it from the 'punkt_tab' resource while older ones use 'punkt', so request
# both; nltk.download reports a resource it does not know instead of raising.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Tokenize the cleaned 'text' column into a list of word tokens per row
df['tokens'] = df['text'].apply(word_tokenize)

# Continue with other preprocessing steps as needed

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
#Convert Gender Values to Binary

# Encode gender as 1 for 'male' (case-insensitive) and 0 for everything else.
# The vectorised .str accessor yields NaN for missing or non-string cells, which
# compare unequal to 'male' and therefore become 0; the previous per-element
# x.lower() call raised AttributeError on the first such cell.
df['gender'] = df['gender'].str.lower().eq('male').astype(int)

In [None]:
#Lemmatize Words using NLTK's WordNetLemmatizer
# Download the WordNet corpus; presumably this is the data the WordNetLemmatizer
# created in the next cell looks words up in.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.stem import WordNetLemmatizer

# One shared lemmatizer instance is reused for every row.
lemmatizer = WordNetLemmatizer()


def _lemmatize_all(tokens):
    """Return a new list with each token passed through lemmatizer.lemmatize."""
    return list(map(lemmatizer.lemmatize, tokens))


df['tokens'] = df['tokens'].apply(_lemmatize_all)


In [None]:
#Remove Numerical Data and Extra Whitespaces
# Compile both patterns once rather than on every row.
_digit_run = re.compile(r'\d+')
_space_run = re.compile(r'\s+')


def _strip_digits_and_squeeze(text):
    """Delete digit runs, collapse whitespace runs to one space, trim the ends."""
    without_digits = _digit_run.sub('', text)
    return _space_run.sub(' ', without_digits).strip()


df['text'] = df['text'].apply(_strip_digits_and_squeeze)

In [None]:
#Saving the Preprocessed Data

# Write the preprocessed frame to an Excel workbook, leaving out the index column.
output_filename = 'preprocessed_data.xlsx'
df.to_excel(excel_writer=output_filename, index=False)