In [85]:
import pandas as pd
import re

# Load the dataset
df = pd.read_csv('tweets.csv')

# Define a function to clean the text data
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'#\w+', '', text)
    #Remove @
    text = re.sub(r'@\w+', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove extra whitespaces
    text = text.strip()
    # Remove 'rt :' (with or without spaces around the colon)
    text = re.sub(r'\brt\s*:\s*', '', text, flags=re.IGNORECASE)
    # Remove \r\n (line breaks)
    text = re.sub(r'\r\n', ' ', text)
    # Remove Unicode placeholders like <u+...>
    text = re.sub(r'<u\+\w+>', '', text)
    # Remove all exclamation marks
    text = text.replace('!', '')
    # Remove all question marks
    text = text.replace('?', '')
    # Remove specific unwanted characters
    text = re.sub(r'<u\+>', '', text)  # Remove <u+>
    text = re.sub(r'[©\':.,|]', '', text)  # Remove :, ', ., ,, |, and ©
    # Remove extra whitespaces caused by the cleanup
    text = re.sub(r'\s+', ' ', text).strip()
    return text


# Apply the cleaning function to the 'text' column
df['clean_text'] = df['text'].apply(clean_text)

# Remove rows with missing values (if any)
df = df.dropna(subset=['clean_text'])

# Save the cleaned data to a new CSV file
df.to_csv('tweets_cleaned.csv', index=False)

print("Data cleaned and saved to 'tweets_cleaned.csv'.")


Data cleaned and saved to 'tweets_cleaned.csv'.


In [86]:
print(df.columns)


Index(['Unnamed: 0', 'text', 'favorited', 'favoriteCount', 'replyToSN',
       'created', 'truncated', 'replyToSID', 'id', 'replyToUID',
       'statusSource', 'screenName', 'retweetCount', 'isRetweet', 'retweeted',
       'longitude', 'latitude', 'clean_text'],
      dtype='object')


In [87]:
import pandas as pd

# Specify the file path
file_path = 'tweets_cleaned.csv'

# Load the dataset
data = pd.read_csv(file_path)

# View the first few rows
data.head()


Unnamed: 0.1,Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted,longitude,latitude,clean_text
0,1,RT @mrvelstan: literally nobody:\r\nme:\r\n\r\n#AvengersEndgame https://t.co/LR9kFwfD5c,False,0,,23/4/2019 10:43,False,,1.12064e+18,,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",DavidAc96,637,True,False,,,literally nobody me
1,2,"RT @agntecarter: i’m emotional, sorry!!\r\n\r\n2014 x 2019\r\n#blackwidow\r\n#captainamerica https://t.co/xcwkCMw18w",False,0,,23/4/2019 10:43,False,,1.12064e+18,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",NRmalaa,302,True,False,,,i’m emotional sorry x
2,3,saving these bingo cards for tomorrow \r\n©\r\n #AvengersEndgame https://t.co/d6For0jwRb,False,0,,23/4/2019 10:43,False,,1.12064e+18,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",jijitsuu,0,False,False,,,saving these bingo cards for tomorrow
3,4,RT @HelloBoon: Man these #AvengersEndgame ads are everywhere https://t.co/Q0lNf5eJsX,False,0,,23/4/2019 10:43,False,,1.12064e+18,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",SahapunB,23781,True,False,,,man these ads are everywhere
4,5,"RT @Marvel: We salute you, @ChrisEvans! #CaptainAmerica #AvengersEndgame https://t.co/VlPEpnXYgm",False,0,,23/4/2019 10:43,False,,1.12064e+18,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",stella22_97,13067,True,False,,,we salute you


In [88]:
# Adjust display option
pd.set_option('display.max_colwidth', None)

# View the first few rows with full text


data[['clean_text']].head(60)

Unnamed: 0,clean_text
0,literally nobody me
1,i’m emotional sorry x
2,saving these bingo cards for tomorrow
3,man these ads are everywhere
4,we salute you
5,the first non-spoiler critic reactions are here and nearly all are exceptionally positive with many prais…
6,ready to rock
7,we’re with him ‘til the end of the line
8,first reactions most emotional most epic mcu film
9,man these ads are everywhere


In [89]:
# Adjust display option
pd.set_option('display.max_colwidth', None)

# View the first few rows with full text


data[['clean_text']].tail(60)

Unnamed: 0,clean_text
14940,man these ads are everywhere
14941,honestly chris evans saying he cried six times and hemsworth saying he cried more than six times means that we’re all literal…
14942,let’s bring our a game on &amp; get this one right releases this friday book your tickets on
14943,we’re with him ‘til the end of the line
14944,man these ads are everywhere
14945,comic con they were first seen as the avengers the last time they will be together
14946,we salute you
14947,we salute you
14948,as we get closer to the we will do to make you watch the great grand finale rt an…
14949,both scarlett and brie are with the infinity stones as rings but until now just them this means something …
