In [1]:
import pandas as pd
import re
import html
from langdetect import detect
import contractions


In [2]:
# One-time setup, to be run once from an IPython session (presumably so the
# NLTK stopword corpus is available locally before the import below is used):
# import nltk
# nltk.download()

# NOTE(review): in the visible part of this notebook, `stopwords` is only
# referenced here and by the commented-out remove_stopwords helper.
from nltk.corpus import stopwords
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## Data Cleaning

- Removing emojis
- Removing URLs
- Removing non-ASCII (Unicode) characters
- Removing hashtags and mentions
- Expanding contractions and removing **stopwords** (stopword removal is currently commented out in the code below)
- Removing special characters


In [3]:
# Compiled once when the cell runs rather than on every call.
#
# The previous character class contained the range U+24C2-U+1F251, which spans
# almost the whole Basic Multilingual Plane above U+24C2 and therefore also
# deleted CJK ideographs, kana, Hangul, etc. It is replaced below by the
# individual emoji code points from that span.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # all supplementary-plane code points; this covers
                             # the emoticon, pictograph, transport/map, flag and
                             # enclosed-alphanumeric-supplement ranges
    "\u2500-\u2BEF"          # box drawing, geometric shapes, misc symbols,
                             # dingbats, arrows
    "\u24C2"                 # circled latin capital letter M
    "\u200d"                 # zero-width joiner (glues emoji sequences)
    "\u231a"                 # watch
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward
    "\ufe0f"                 # variation selector-16 (emoji presentation)
    "\u3030"                 # wavy dash
    "\u303d"                 # part alternation mark
    "\u3297\u3299"           # circled ideograph congratulation / secret
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Runs of matching characters are deleted outright (nothing is inserted in
    their place), so surrounding whitespace is left untouched. Ordinary
    non-Latin scripts in the Basic Multilingual Plane (for example Chinese,
    Japanese or Korean text) are preserved.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    return _EMOJI_PATTERN.sub('', text)
# def remove_stopwords(text):
#     stop_words = set(stopwords.words('english'))
# #     words = word_tokenize(text)
#     filtered_text = [word for word in text if word.lower() not in stop_words]
#     return ' '.join(filtered_text)

def preprocess_text(text):
    """Normalise a raw message into lowercase ASCII words joined by single spaces.

    Steps, in order: unescape HTML entities, drop URLs, expand contractions,
    drop non-ASCII characters, drop @mentions and #hashtags, drop every
    character that is neither a word character nor whitespace, lowercase, and
    finally drop leftover ``pic.twitter.com`` tokens while collapsing
    whitespace.

    Args:
        text: the raw message. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing cell) is treated as empty.

    Returns:
        The cleaned string; ``''`` for non-string input.
    """
    # html.unescape raises TypeError on non-strings such as NaN.
    if not isinstance(text, str):
        return ''

    # Convert HTML entities back to their original characters
    text = html.unescape(text)

    # Remove URLs. The "www" branch requires a word boundary and a literal dot
    # so that words merely containing "www" (e.g. "awwww") are left intact.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)

    # Expand contractions before punctuation is stripped, while the
    # apostrophes are still present.
    text = contractions.fix(text)

    # Remove every non-ASCII character
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # Remove hashtags (#topic)
    text = re.sub(r'#\w+', '', text)

    # Remove special characters. They are deleted, not replaced by a space, so
    # words separated only by punctuation are fused ("3pm!...Support" -> "3pmSupport").
    text = re.sub(r'[^\w\s]', '', text)

    # Remove emojis. The text is already pure ASCII at this point, so this
    # call does not change it; kept so the step stays visible in the pipeline.
    text = remove_emojis(text)

    # Convert text to lowercase
    text = text.lower()

    # "pic.twitter.com/<id>" has become "pictwittercom<id>" after punctuation
    # removal; drop those tokens. split()/join also collapses whitespace runs.
    text = ' '.join(word for word in text.split() if not word.startswith('pictwittercom'))

#     text=remove_stopwords(text)

    return text






## Considering only English texts

In [4]:

def is_english(text):
    """Return True when ``detect`` classifies ``text`` as English ('en').

    This is a best-effort filter: if detection raises for any reason the text
    is reported as not English instead of propagating the error.

    NOTE(review): langdetect's documentation describes detection as
    non-deterministic unless ``DetectorFactory.seed`` is set; consider seeding
    it once near the imports if the filtered row counts must be reproducible.

    Args:
        text: the string to classify.

    Returns:
        bool: True only if detection succeeded and returned 'en'.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still stop a long-running .apply().
        return False

## Balancing Number of rows belonging to each label

In [5]:

def balance_dataframe(df, binary_column, text_column):
    """Downsample so labels 0 and 1 in ``binary_column`` have equal row counts.

    Within each label the rows are ordered by the character length of
    ``text_column`` (longest first) and the first ``min(count_0, count_1)``
    rows of each are kept. Rows whose label is neither 0 nor 1 are not part
    of the result.

    Args:
        df: input DataFrame; it is not modified.
        binary_column: name of the 0/1 label column.
        text_column: name of the text column used for the length ordering.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, all label-0 rows followed
        by all label-1 rows, and with ``text_column`` and the temporary
        ``text_length`` column removed.
    """
    def _longest_first(label_value):
        # Rows carrying this label, tagged with their text length and ordered by it.
        subset = df[df[binary_column] == label_value]
        subset = subset.assign(text_length=subset[text_column].str.len())
        return subset.sort_values(by='text_length', ascending=False)

    zeros = _longest_first(0)
    ones = _longest_first(1)

    # Trimming both sides to the smaller count leaves the smaller side whole.
    keep = min(len(zeros), len(ones))
    balanced = pd.concat([zeros.iloc[:keep], ones.iloc[:keep]], ignore_index=True)

    # The helper length column and the raw text column are not returned.
    return balanced.drop(columns=['text_length', text_column])


## Function For triggering pre-processing

In [6]:
def preprocess_and_balance_dataframe(df):
    """Clean, language-filter and class-balance a labelled text DataFrame.

    Expects the columns 'message' (raw text) and 'label'. The input frame is
    left untouched; all work happens on copies.

    Steps:
        1. Add 'cleaned_text' by applying ``preprocess_text`` to 'message'.
        2. Keep only rows whose 'cleaned_text' passes ``is_english``.
        3. Reset the index.
        4. Balance with ``balance_dataframe(df, 'label', 'message')``.

    Args:
        df: DataFrame with 'message' and 'label' columns.

    Returns:
        Whatever ``balance_dataframe`` returns for the filtered frame.
    """
    # assign() builds a new frame, so the caller's DataFrame does not gain a
    # 'cleaned_text' column as a side effect.
    df = df.assign(cleaned_text=df['message'].apply(preprocess_text))

    # astype(bool): .apply on an empty column yields an object-dtype Series,
    # which pandas would not treat as a boolean mask.
    english_mask = df['cleaned_text'].apply(is_english).astype(bool)
    df = df[english_mask]

    # Reset the index of the DataFrame
    df = df.reset_index(drop=True)

    # Balance on 'label'; 'message' is the text column handed to the balancer
    df = balance_dataframe(df, 'label', 'message')

    return df

## Running for all three datasets

In [34]:
df1 = pd.read_csv('sentiment_tweets3.csv')

# Spot-check the cleaner: for every row print the raw message, a blank line,
# its cleaned form, and another blank line.
for raw_message in df1['message']:
    print(raw_message, end='\n\n')
    print(preprocess_text(raw_message), end='\n\n')

# Full pipeline for this dataset (currently disabled):
# print(df1['label'].value_counts())
# df1=df1.drop('Unnamed: 0',axis=1)
# df1=preprocess_and_balance_dataframe(df1)
# print(df1['label'].value_counts())
# df1

just had a real good moment. i missssssssss him so much, 

just had a real good moment i missssssssss him so much

is reading manga  http://plurk.com/p/mzp1e

is reading manga

@comeagainjen http://twitpic.com/2y2lx - http://www.youtube.com/watch?v=zoGfqvh2ME8 



@lapcat Need to send 'em to my accountant tomorrow. Oddly, I wasn't even referring to my taxes. Those are supporting evidence, though. 

need to send them to my accountant tomorrow oddly i was not even referring to my taxes those are supporting evidence though

ADD ME ON MYSPACE!!!  myspace.com/LookThunder

add me on myspace myspacecomlookthunder

so sleepy. good times tonight though 

so sleepy good times tonight though

@SilkCharm re: #nbn as someone already said, does fiber to the home mean we will all at least be regular now 

re as someone already said does fiber to the home mean we will all at least be regular now

23 or 24ï¿½C possible today. Nice 

23 or 24c possible today nice

nite twitterville  workout in the am  -

youtube was a big help. 

youtube was a big help

 Travel is sorted! Went into town - and found National Express had disappeared :S Thankyou porn for the internet! I now have an e-ticket!

travel is sorted went into town and found national express had disappeared s thankyou porn for the internet i now have an eticket

Not long till LONDON BABY!!!  and then the EMIRATES ON SUNDAY!!!!  YEAH!! cant wait... 

not long till london baby and then the emirates on sunday yeah cannot wait

Sorting out my car insurance. Lady on phone is currently trying to get me a better deal from her boss!! 

sorting out my car insurance lady on phone is currently trying to get me a better deal from her boss

OPEN INVITE TO MY TWEEPS: My degree show @ LSAD clare st. limerick 13th June 6pm, meet up get pissed  all welcome let me know!

open invite to my tweeps my degree show lsad clare st limerick 13th june 6pm meet up get pissed all welcome let me know

@nuttycow even harsher !! 

even harsher

@GADataGuy You t

ICU EDWARD CULLEN WEARING TENTH DOCTORS BLUE SUIT!  

icu edward cullen wearing tenth doctors blue suit

Just received the best text in my life from Mat! I'm officially in LOVE!  lol A*

just received the best text in my life from mat i am officially in love lol a

at church. blaring dallas green's voice in my car. 

at church blaring dallas greens voice in my car

Wow, fell in love with Sly Cooper and Ratchet. Well, I know what I'm buying on PS3 now 

wow fell in love with sly cooper and ratchet well i know what i am buying on ps3 now

I guess bed would be appropriate.... school tomorrow 

i guess bed would be appropriate school tomorrow

@mrskutcher Have you ever gotten a tarot reading?  I'd love to read for you sometime.  Thanks so much for your work, btw.    Best!

have you ever gotten a tarot reading i would love to read for you sometime thanks so much for your work by the way best

@WilliamPromoter did you like terminator? 

did you like terminator

@aplynch either job title woul

http://twitpic.com/6qm0m - Let us all go! 

let us all go

Im on my way to the videoshoot!, get down there now! try get there before 3pm!...Support the hustle!  COVENT GARDEN STATION!

i am on my way to the videoshoot get down there now try get there before 3pmsupport the hustle covent garden station

Starting my day off EARLY! Lets see what God has planned for me today, NO foolishness I hope. 

starting my day off early let us see what god has planned for me today no foolishness i hope

#downwith  @WrenTheDoll filling my whole page of tweets when i log on.. LOL jk girly 

filling my whole page of tweets when i log on lol jk girly

wooo everyone come to little lehigh parkway and support @msarro and I in our first race! 

wooo everyone come to little lehigh parkway and support and i in our first race

@DeLoresPressley We need to coordinate a massive TweetUp for all speakers on twitter (maybe motivation for more to get on board!) 

we need to coordinate a massive tweetup for all speakers

Now that's a job  Sr. Web Developer *Front End* at Playboy http://tr.im/oDFS

now that is a job sr web developer front end at playboy

@DitaVonTeese it's a classic 

it is a classic

My brother has officially slept for 12 hours straight and still counting 

my brother has officially slept for 12 hours straight and still counting

@netmogul and they clapped 

and they clapped

Not sure exactly what I've won with this competition... Either way I've still won something... Its all good 

not sure exactly what i have won with this competition either way i have still won something its all good

Really crazy  Look: http://tinyurl.com/lu6bfv

really crazy look

@elyse thanks!  I sometimes have to control myself to stop eating xD

thanks i sometimes have to control myself to stop eating xd

@bassiee Excuse me? 

excuse me

@lotteduncan great look forward to it 

great look forward to it

@joegreenz - absolutely loved your children's book  Have you published it? And thanks for the link to the ba

I'm sure no one's said this to you, but you should do yoga to improve your depression

i am sure no ones said this to you but you should do yoga to improve your depression

depression is real

depression is real

I'm always searching for ways to encourage myself. I never heard this song it is beautiful! I dedicate this to all who struggle with depression!  https://www.youtube.com/attribution_link?a=K_wJljuT7as&u=%2Fwatch%3Fv%3D3b3TqoE7te4%26feature%3DshareÂ â¦

i am always searching for ways to encourage myself i never heard this song it is beautiful i dedicate this to all who struggle with depression

I like Kirk Fanklin's songs because he regularly talks about depression and doesn't find a way to sneak in homophobia

i like kirk fanklins songs because he regularly talks about depression and does not find a way to sneak in homophobia

#WellbeingWednesday  Harvard School of Public Health found that females who drink a minimum of four cups of coffee per day could lower the risk of depr

In [17]:
# Load the combined tweets and show the class distribution as stored on disk.
df2 = pd.read_csv('tweets_combined.csv')
print(df2['target'].value_counts())

# Drop the 'Unnamed: 0' column, then rename the remaining two columns to the
# names preprocess_and_balance_dataframe expects.
df2 = df2.drop(columns=['Unnamed: 0'])
df2.columns = ['message', 'label']

# Clean, keep English rows, balance the classes, and show the new distribution.
df2 = preprocess_and_balance_dataframe(df2)
print(df2['label'].value_counts())
df2

target
0    2357
1     843
Name: count, dtype: int64
label
0    679
1    679
Name: count, dtype: int64


Unnamed: 0,label,cleaned_text
0,0,feeling depressed know alone know see hear sup...
1,0,erosion soul caused people positions power liv...
2,0,depression may carry feeling worthlessness fee...
3,0,going keep banging cos true focus get stop tel...
4,0,going keep banging cos true focus get stop tel...
...,...,...
1353,1,working new years eve
1354,1,forgot cheese cake work
1355,1,one sec
1356,1,god please help


In [31]:
# Load the Twitter non-advert sheet and show the class distribution on disk.
df3 = pd.read_excel('Twitter_Non-Advert.xlsx')
print(df3['label'].value_counts())

# Rename the two columns to the names preprocess_and_balance_dataframe expects.
df3.columns = ['message', 'label']

# Clean, keep English rows, balance the classes, and show the new distribution.
df3 = preprocess_and_balance_dataframe(df3)
print(df3['label'].value_counts())
df3

 speak-no-evil monkey Can I Be Honest With You? globe showing Europe-Africa telephone 03453192666 e-mail clinic.co.uk therapy help NLP CBT hypnotherapy mentalhealth Hertfordshire anxiety stress depression confidence counselling
Frau Goebbels early signs of psychosis psychotic mentalhealth JacindaPcychotic 
A lot of work and unfulfilled tasks plunge you into extreme stress and provoke nervous tension. As a result, you overeat, trying to eat your problems. Because of overeating and stress, you can not sleep, and a full rest starts this chain in a new circle. 
Private health insurance delivers value for young people - particularly during the pandemic, when mental health care is paramount. privatehealth mentalhealth 
XpertOnline offers you the convenience of viewing your patient’s medical report and treatment history, while consulting them online. . . . Personalised medicine telemedicine healthcare mentalhealth healthylifestyle doctors wellness dermatology 
The September issue is out! Expl

In [22]:
# Load the Reddit sheet and discard the 'title' and 'body' columns.
df4 = pd.read_excel('Reddit.xlsx')
df4 = df4.drop(columns=['title', 'body'])

# Rename the remaining two columns to the names the pipeline expects.
df4.columns = ['message', 'label']

# Clean, keep English rows, balance the classes, and show the new distribution.
df4 = preprocess_and_balance_dataframe(df4)
print(df4['label'].value_counts())
df4

<class 'numpy.int64'>
0    376
1    376
Name: label, dtype: int64


Unnamed: 0,label,cleaned_text
0,0,update i am probably getting back together wit...
1,0,despite all downfalls and against everything i...
2,0,how i learnt how to love myself at the age of ...
3,0,how to be happy instead of miserable when you ...
4,0,the day i truly felt like i became a dad so to...
...,...,...
747,1,staying stressed i am so stressed more than no...
748,1,i have school tomorrow and i am so fucking tir...
749,1,can i be treated for depression without admitt...
750,1,i have no sleep ever since i got in my second ...


## Saving the combined results

In [23]:
# Only df4 (the Reddit frame) is in the list, so the saved file contains
# Reddit rows only.
result_concat_row = pd.concat([df4])
print(result_concat_row['label'].value_counts())

# Remove exact duplicate rows and re-check the class counts afterwards.
result_concat_row = result_concat_row.drop_duplicates()
print(result_concat_row['label'].value_counts())

# Write the result without the index column.
result_concat_row.to_csv('Cleaned_Reddit.csv', index=False)

0    376
1    376
Name: label, dtype: int64
0    376
1    376
Name: label, dtype: int64
