# Notebook for importing personal csv

### Importing libraries

In [89]:
import pandas as pd
import numpy as np
import sys
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from translate import Translator 
import os

### Check version of Pandas and version of Python

These should be 1.0.3 for Pandas and I am using 3.7.5 for Python because of SKlearn compatibilities. However, 3.8 should also work.

In [64]:
print('Python version is', sys.version)
print('Pandas version is', pd.__version__)

Python version is 3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)]
Pandas version is 1.0.3


### Importing the CSV

Please define your path in the variable.

In [86]:
#path = r'C:\Users\20193815\Documents\Data Challenge\Full_basic.csv' 
#path = r'C:\Users\20193635\Documents\Data Science Year 1\Q4\Data Challenge\Fulls csv\Full_basic.csv' 
path = r'C:\Users\20193635\Documents\Data Science Year 1\Q4\text.csv'


In [66]:
df = pd.read_csv(path)
print(df.dtypes)


  interactivity=interactivity, compiler=compiler, result=result)


_id        object
id_str    float64
text       object
dtype: object


### Cleaning up the df

First we remove the _id column if present, then all the missing values. Afterwards we convert columns that are present to the correct dtype.

In [67]:
def cleanup(df):
    """Removes unnecessary columns and converts to the correct dtype"""
    try:
        df.drop('_id', axis = 1, inplace = True)
    except:
        print('_id not found')
    
    #Remove rows where id has a missing value
    not_number = df[df['id_str'].isna()]
    lst = not_number.index.array
    df.drop(lst, inplace = True)
    col_names = list(df.columns)
    
    string = ['text', 'lang', 'in_reply_to_status_id_str'] # Not all columnsnames have to be present in the used data set
    integer = ['id_str','user.id_str', 'in_reply_to_status_id']
    
    
    for col in col_names:
        if col == 'created_at':
            df[col] = df[col].astype(str)
            print('created_at converted to string, use timestamp for datetime dtype')
        if col == 'timestamp_ms':
            df[col] = pd.to_datetime(df[col], unit = 'ms')
            print('timestamp converted to datetime dtype')
        if col == 'id':
            print('Skip id, use id_str instead')
        if col in integer:
            df[col] = df[col].astype(pd.Int64Dtype())
        if col in string:
            df[col] = df[col].astype(str)

    return df

df = cleanup(df)
print('Dtypes after transformation:\n',df.dtypes)
df.head()




Dtypes after transformation:
 id_str     Int64
text      object
dtype: object


Unnamed: 0,id_str,text
0,1131172858951024640,La ruta de easyJet entre Londres y Menorca tra...
1,1131172864147808256,RT @bttr_as1: @goody_tracy Here’s a list of so...
2,1131172867985485824,@British_Airways
3,1131172909463027712,RT @TheRaceRadio: Nice change by @AmericanAir....
4,1131172975682605056,RT @sandeeprrao1991: BREAKING:-\nKLM to fly 3x...


While string columns show up as having an object dtype, this is not actually the case. When retrieving a value from the column and checking the dtype it is a string. There might be some values in those columns that cause this issue.

In [68]:
df.reset_index(inplace = True, drop = True )


# partial data to test

# Change slice !!!

In [69]:
df_backup = df.copy()    # do not run this after the first time (you will lose data)
df = df.iloc[0:100]    # change slice !!!

In [70]:
#df.head()

# Start sentiment analysis

In [71]:
analyser = SentimentIntensityAnalyzer()  
translator = Translator(to_lang = 'english')

In [72]:
def sentiment_analyzer_scores(text, engl=True):
    if engl:                                     # if the leng is engl it skips the translation
        trans = text
    else:
        trans = translator.translate(text).text    # it translates the text
    score = analyser.polarity_scores(trans)     # gets the scores from the library 
    lb = score['compound']
    if lb >= 0.05:                              # divides the sentiment score into (-1,0,1) 
        return 1                                # can be changed to even more catagories
    elif (lb > -0.05) and (lb < 0.05):               
        return 0
    else:
        return -1

In [73]:
def sentiment_compound_scores(text, engl=True):
    if engl:                                     # if the leng is engl it skips the translation
        trans = text
    else:
        trans = translator.translate(text).text    # it translates the text
    score = analyser.polarity_scores(trans)     # gets the scores from the library 
    lb = score['compound']
    return lb

In [74]:
sentiment = []                        # gets a list of all the sentimental values of the tweets
for x in range(len(df)) :  
    
    sentiment.append(sentiment_analyzer_scores(str(df['text'][x])))
    
compound = []
for x in range(len(df)) :
    
    compound.append(sentiment_compound_scores(str(df['text'][x])))

In [75]:
df.insert(2 ,'sentiment', sentiment)   # inserting the sentimental values into the dataframe
df.insert(2, 'compound', compound)

In [76]:
df.sample(10)

Unnamed: 0,id_str,text,compound,sentiment
75,1131174378903937024,@TommyPoulson @Eurostar @francegalop @paris_lo...,0.7964,1
26,1131173456597405696,RT @Claire2day: @Ryanair @RyanairFlights shock...,-0.5445,-1
0,1131172858951024640,La ruta de easyJet entre Londres y Menorca tra...,0.0,0
82,1131174581182423040,"RT @ETNOWlive: Sources say @EtihadAirways, Hin...",0.6705,1
5,1131173010235375616,EasyJet was on a slippery slope when it charge...,-0.2023,-1
36,1131173625258815488,@Pelusitaaaa1 Hoi! Enkel KLM/Air France-vluch...,0.0,0
84,1131174585892839424,RT @ameyaw112: Thank you @SingaporeAir for sho...,0.8316,1
37,1131173644271587328,Yes please @Ryanair @easyJet,0.6124,1
1,1131172864147808256,RT @bttr_as1: @goody_tracy Here’s a list of so...,0.3612,1
46,1131173789356711936,RT @EquityLPNP: Hi @Everymancinema. You've ref...,0.0516,1


In [77]:
df['sentiment'].value_counts()

 1    42
 0    40
-1    18
Name: sentiment, dtype: int64

In [93]:
df.to_csv('text+sent1.csv')

In [95]:
df_1 = pd.read_csv('text+sent1.csv')

In [96]:
df_1

Unnamed: 0.1,Unnamed: 0,id_str,text,compound,sentiment
0,0,1131172858951024640,La ruta de easyJet entre Londres y Menorca tra...,0.0000,0
1,1,1131172864147808256,RT @bttr_as1: @goody_tracy Here’s a list of so...,0.3612,1
2,2,1131172867985485824,@British_Airways,0.0000,0
3,3,1131172909463027712,RT @TheRaceRadio: Nice change by @AmericanAir....,0.3400,1
4,4,1131172975682605056,RT @sandeeprrao1991: BREAKING:-\nKLM to fly 3x...,0.0000,0
...,...,...,...,...,...
95,95,1131174735813832704,Did you mean @jetairways ?,0.0000,0
96,96,1131174754784829440,@Jaxon914 @AmericanAir Contact the better busi...,0.8126,1
97,97,1131174768114323456,Can always count on @AmericanAir to help you m...,0.0516,1
98,98,1131174769318125568,"On arrive fons’ comme Ja', tous high, plane co...",0.0000,0
