# Notebook for importing personal csv

### Importing libraries

In [3]:
import pandas as pd
import numpy as np
import sys
from datetime import datetime

### Check version of Pandas and version of Python

This notebook was written with Pandas 1.0.3 and Python 3.7.5 (Python 3.7.5 was chosen for compatibility with scikit-learn). Python 3.8 should also work.

In [4]:
# Report the interpreter and pandas versions the kernel is running.
print(f'Python version is {sys.version}')
print(f'Pandas version is {pd.__version__}')

Python version is 3.7.5 (tags/v3.7.5:5c02a39a0b, Oct 15 2019, 00:11:34) [MSC v.1916 64 bit (AMD64)]
Pandas version is 1.0.3


### Importing the CSV

Set the `path` variable below to the location of the CSV file on your machine.

In [6]:
# Location of the CSV file to import (raw string so the backslashes are taken literally).
# This is an absolute path to a local user directory; change it to match your machine.
path = r'C:\Users\20193815\Documents\Data Challenge\mentions.csv'
# Alternative input path, currently disabled:
#path = r'C:\Users\20193635\Documents\Data Science Year 1\Q4\Data Challenge\Fulls csv\Full_basic.csv'

In [7]:
# Load the CSV at `path` into the DataFrame `df`, letting pandas infer the dtypes.
df = pd.read_csv(path)
# NOTE(review): in the recorded output id_str is inferred as float64. float64
# cannot represent every integer above 2**53 exactly, so 19-digit ids may be
# rounded at this point -- consider passing an explicit dtype for id_str to
# read_csv; confirm against the raw CSV.
print(df.dtypes)
# Last expression of the cell: displays the first rows of the frame.
df.head()

_id                        object
id_str                    float64
entities.user_mentions     object
dtype: object


Unnamed: 0,_id,id_str,entities.user_mentions
0,5ec66ff9f3c74bef68594542,1.131173e+18,[]
1,5ec66ff9f3c74bef68594543,1.131173e+18,"[{""screen_name"":""bttr_as1"",""name"":""AGirlHasNoP..."
2,5ec66ff9f3c74bef68594544,1.131173e+18,"[{""screen_name"":""British_Airways"",""name"":""Brit..."
3,5ec66ff9f3c74bef68594545,1.131173e+18,"[{""screen_name"":""TheRaceRadio"",""name"":""Race Ra..."
4,5ec66ff9f3c74bef68594546,1.131173e+18,"[{""screen_name"":""sandeeprrao1991"",""name"":""VOBL..."


### Cleaning up the df

First we remove the `_id` column if present, then we drop every row in which `id_str` is missing. Afterwards we convert the columns that are present to the correct dtype.

In [8]:
def cleanup(df):
    """Drop the ``_id`` column and id-less rows, then fix the column dtypes.

    The frame is modified in place and is also returned, so both
    ``cleanup(df)`` and ``df = cleanup(df)`` leave ``df`` cleaned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. None of the handled columns is required; a column
        that is absent is simply skipped.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df`` with:

        * the ``_id`` column removed (a message is printed if it is absent),
        * rows with a missing ``id_str`` removed,
        * ``created_at``, ``text``, ``lang`` and ``in_reply_to_status_id_str``
          converted with ``astype(str)``,
        * ``timestamp_ms`` converted to datetime (interpreted as milliseconds),
        * ``id_str``, ``user.id_str`` and ``in_reply_to_status_id`` converted
          to the nullable ``Int64`` dtype.

    Notes
    -----
    * If an id column arrives as float64, values above 2**53 may already have
      been rounded when the data was loaded; casting to ``Int64`` here cannot
      restore the lost digits.
    * ``astype(str)`` turns missing values into the literal string ``'nan'``.
    """
    # Explicit membership test instead of a bare ``except`` so that genuine
    # errors (and KeyboardInterrupt) are no longer silently swallowed.
    if '_id' in df.columns:
        df.drop(columns='_id', inplace=True)
    else:
        print('_id not found')

    # Remove rows where id has a missing value. dropna works on the rows
    # themselves, so rows that merely share an index label with an id-less
    # row are kept (dropping by label would remove them as well).
    if 'id_str' in df.columns:
        df.dropna(subset=['id_str'], inplace=True)
    else:
        print('id_str not found')

    # Not all column names have to be present in the used data set
    string = ['text', 'lang', 'in_reply_to_status_id_str']
    integer = ['id_str', 'user.id_str', 'in_reply_to_status_id']

    # Iterate over a snapshot of the column names; each name matches at most
    # one of the branches below.
    for col in list(df.columns):
        if col == 'created_at':
            df[col] = df[col].astype(str)
            print('created_at converted to string, use timestamp for datetime dtype')
        elif col == 'timestamp_ms':
            df[col] = pd.to_datetime(df[col], unit='ms')
            print('timestamp converted to datetime dtype')
        elif col == 'id':
            print('Skip id, use id_str instead')
        elif col in integer:
            # Nullable integer dtype so that missing values survive the cast.
            df[col] = df[col].astype(pd.Int64Dtype())
        elif col in string:
            df[col] = df[col].astype(str)

    return df

# Clean the loaded frame. cleanup() modifies df in place and returns it, so
# re-running this cell operates on the already-cleaned frame.
df = cleanup(df)

# Show the resulting dtypes, then display the first rows as the cell output.
print('Dtypes after transformation:\n',df.dtypes)
df.head()




Dtypes after transformation:
 id_str                     Int64
entities.user_mentions    object
dtype: object


Unnamed: 0,id_str,entities.user_mentions
0,1131172858951024640,[]
1,1131172864147808256,"[{""screen_name"":""bttr_as1"",""name"":""AGirlHasNoP..."
2,1131172867985485824,"[{""screen_name"":""British_Airways"",""name"":""Brit..."
3,1131172909463027712,"[{""screen_name"":""TheRaceRadio"",""name"":""Race Ra..."
4,1131172975682605056,"[{""screen_name"":""sandeeprrao1991"",""name"":""VOBL..."


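String columns show up as having an `object` dtype even though the individual values are Python strings: retrieving a value from such a column and checking its type gives `str`. This is how pandas stores strings by default rather than a sign of problematic values in those columns. If an explicit string dtype is needed, pandas 1.0 also offers a dedicated `string` dtype (`astype('string')`).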