# Notebook for importing a personal CSV file

### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import sys
from datetime import datetime

### Check version of Pandas and version of Python

The pandas version should be 1.0.3. For Python I am using 3.7.x because of scikit-learn compatibility (the output recorded below comes from 3.7.3). However, Python 3.8 should also work.

In [2]:
# Report the interpreter and pandas versions so they can be compared with the expected ones.
print(f'Python version is {sys.version}')
print(f'Pandas version is {pd.__version__}')

Python version is 3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)]
Pandas version is 1.0.3


### Importing the CSV

Set the `path` variable below to the location of your CSV file.

In [3]:
# Absolute location of the CSV file to load; edit this to match your own machine.
# A raw string (r'...') is used so the backslashes in the Windows path are not treated as escapes.
# Alternative location, kept for reference:
#path = r'C:\Users\20193815\Documents\Data Challenge\Full_basic.csv' 
path = r'C:\Users\20193635\Documents\Data Science Year 1\Q4\Data Challenge\Fulls csv\Full_basic.csv' 

In [4]:
# Load the CSV at `path` into a DataFrame, letting pandas infer every column's dtype.
# NOTE(review): the recorded output shows id_str and user.id_str inferred as float64.
# float64 cannot represent every 64-bit integer exactly, so large IDs may be rounded
# on import -- consider passing an explicit dtype= to read_csv. TODO confirm against the raw file.
df = pd.read_csv(path)
# Show the inferred dtype of each column, then preview the first rows.
print(df.dtypes)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


_id              object
created_at       object
id               object
id_str          float64
text             object
user.id_str     float64
lang             object
timestamp_ms    float64
dtype: object


Unnamed: 0,_id,created_at,id,id_str,text,user.id_str,lang,timestamp_ms
0,5ec66ff9f3c74bef68594542,Wed May 22 12:20:00 +0000 2019,1.13117e+18,1.131173e+18,La ruta de easyJet entre Londres y Menorca tra...,393374100.0,es,1558528000000.0
1,5ec66ff9f3c74bef68594543,Wed May 22 12:20:01 +0000 2019,1.13117e+18,1.131173e+18,RT @bttr_as1: @goody_tracy Here’s a list of so...,3420691000.0,en,1558528000000.0
2,5ec66ff9f3c74bef68594544,Wed May 22 12:20:02 +0000 2019,1.13117e+18,1.131173e+18,@British_Airways,394376600.0,und,1558528000000.0
3,5ec66ff9f3c74bef68594545,Wed May 22 12:20:12 +0000 2019,1.13117e+18,1.131173e+18,RT @TheRaceRadio: Nice change by @AmericanAir....,36488560.0,en,1558528000000.0
4,5ec66ff9f3c74bef68594546,Wed May 22 12:20:28 +0000 2019,1.13117e+18,1.131173e+18,RT @sandeeprrao1991: BREAKING:-\nKLM to fly 3x...,14193350.0,en,1558528000000.0


### Cleaning up the df

First we remove the `_id` column if present, then drop every row where `id_str` is missing. Afterwards, the columns that are present are converted to the correct dtype.

In [7]:
def cleanup(df):
    """Remove unnecessary data from a tweet DataFrame and fix column dtypes.

    The frame is modified in place and the same object is returned, so both
    ``cleanup(df)`` and ``df = cleanup(df)`` work.

    Steps, in order:

    1. Drop the ``_id`` column if it exists; otherwise print a notice.
    2. Drop every row whose ``id_str`` is missing.
    3. Walk over the remaining columns in their existing order and convert
       the ones that are recognised:

       * ``created_at``   -> ``str``
       * ``timestamp_ms`` -> datetime, interpreting the values as milliseconds
       * ``id``           -> left untouched (a message points to ``id_str``)
       * ``id_str``, ``user.id_str``, ``in_reply_to_status_id``
                          -> pandas nullable ``Int64``
       * ``text``, ``lang``, ``in_reply_to_status_id_str`` -> ``str``

    Apart from ``id_str``, none of these columns has to be present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Must contain an ``id_str`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the in-place changes.

    Raises
    ------
    KeyError
        If ``df`` has no ``id_str`` column.
    """
    # Check for the column explicitly rather than wrapping drop() in a bare
    # ``except``: that would also hide unrelated errors (and even
    # KeyboardInterrupt) behind a misleading "_id not found" message.
    if '_id' in df.columns:
        df.drop(columns='_id', inplace=True)
    else:
        print('_id not found')

    # Remove rows where id_str has a missing value
    df.drop(index=df.index[df['id_str'].isna()], inplace=True)

    # Not all column names have to be present in the used data set
    string_cols = ('text', 'lang', 'in_reply_to_status_id_str')
    integer_cols = ('id_str', 'user.id_str', 'in_reply_to_status_id')

    # Snapshot the column names so the loop is independent of the assignments below.
    for col in list(df.columns):
        if col == 'created_at':
            df[col] = df[col].astype(str)
            print('created_at converted to string, use timestamp for datetime dtype')
        elif col == 'timestamp_ms':
            df[col] = pd.to_datetime(df[col], unit='ms')
            print('timestamp converted to datetime dtype')
        elif col == 'id':
            print('Skip id, use id_str instead')
        elif col in integer_cols:
            # Nullable integer dtype: keeps missing values as <NA> instead of forcing float.
            df[col] = df[col].astype(pd.Int64Dtype())
        elif col in string_cols:
            df[col] = df[col].astype(str)

    return df

# Clean the loaded frame; cleanup() changes it in place and hands the same object back.
df = cleanup(df)
# Report the dtypes after conversion, then preview the first rows.
print(f'Dtypes after transformation:\n {df.dtypes}')
df.head()




_id not found
created_at converted to string, use timestamp for datetime dtype
Skip id, use id_str instead
timestamp converted to datetime dtype
Dtypes after transformation:
 created_at              object
id                      object
id_str                   Int64
text                    object
user.id_str              Int64
lang                    object
timestamp_ms    datetime64[ns]
dtype: object


Unnamed: 0,created_at,id,id_str,text,user.id_str,lang,timestamp_ms
0,Wed May 22 12:20:00 +0000 2019,1.13117e+18,1131172858951024640,La ruta de easyJet entre Londres y Menorca tra...,393374091,es,2019-05-22 12:20:00.406
1,Wed May 22 12:20:01 +0000 2019,1.13117e+18,1131172864147808256,RT @bttr_as1: @goody_tracy Here’s a list of so...,3420691215,en,2019-05-22 12:20:01.645
2,Wed May 22 12:20:02 +0000 2019,1.13117e+18,1131172867985485824,@British_Airways,394376606,und,2019-05-22 12:20:02.560
3,Wed May 22 12:20:12 +0000 2019,1.13117e+18,1131172909463027712,RT @TheRaceRadio: Nice change by @AmericanAir....,36488556,en,2019-05-22 12:20:12.449
4,Wed May 22 12:20:28 +0000 2019,1.13117e+18,1131172975682605056,RT @sandeeprrao1991: BREAKING:-\nKLM to fly 3x...,14193348,en,2019-05-22 12:20:28.237


The string columns show up with the `object` dtype. This is expected: pandas stores Python strings in `object` columns, so retrieving a single value from such a column and checking its type gives `str`. Note that `astype(str)` also turns any missing values in those columns into the literal string `'nan'`.