In [3]:
import pandas as pd
import numpy as np

In [4]:
class Data_Preparation():
    """
        Preparação dos dados
    """
    def __init__(self):
        path = r'../../../../data/DataPrepFinal.csv'
        self.df = pd.read_csv(
            filepath_or_buffer=path,
            sep=',',
            decimal='.'
        )

        path = r'../../../../data/campaign.csv'
        self.df_descrição = pd.read_csv(
            filepath_or_buffer=path,
            sep=',',
            decimal='.'
        )

        path = r'../../../../data/invested.csv'
        self.df_invested = pd.read_csv(
            filepath_or_buffer=path,
            sep=',',
            decimal='.'
        )

    def dataframe(self):
        df = self.df.head(5)
        display(df)

    def excluindo_colunas(self):
        self.df = self.df.drop(
            columns=[
                'Unnamed: 0',
                'name',
                'category',
                'goal',
                'pledged',
                'usd pledged',
            ]
        )

        display(self.df)

    def removendo_a_string_usd_da_coluna_usd_goal_real(self):
        self.df['usd_goal_real'] = self.df['usd_goal_real'].apply(
            lambda x: x.split('USD ')
        )

        display(self.df)

    def removendo_a_hora_e_minuto_da_coluna_launched(self):
        self.df['launched'] = self.df['launched'].str.replace('\d\d:\d\d', '', regex=True)

        display(self.df)

    def removendo_a_hora_e_minuto_da_coluna_launched(self):
        self.df['launched'] = self.df['launched'].str.replace(' \d\d:\d\d', '', regex=True)

        display(self.df)

    def convertendo_a_coluna_usd_goal_real_para_int64(self):
        self.df['usd_goal_real'] = self.df['usd_goal_real'].astype('int64')

        display(self.df)

    def tipos_presentes_no_dataframes(self):
        df = self.df.dtypes

        display(df)

    def convertendo_a_coluna_deadline_launched_para_o_tipo_datetime(self):
        self.df['deadline'] = pd.to_datetime(
            arg=self.df['deadline'],
            format='%d/%m/%y'
        )

        self.df['launched'] = pd.to_datetime(
            arg=self.df['launched'],
            format='%d/%m/%y'
        )

    def criando_a_coluna_do_intervalo_de_dias(self):
        self.df['time_range'] = (self.df['deadline'] - self.df['launched']).dt.days

        display(self.df)

    def verificando_valores_unicos_no_df_descrição(self):
        df = self.df_descrição['Text Description'].unique()

        display(df)

    def excluindo_a_coluna_text_description_do_df_descrição(self):
        self.df_descrição = self.df_descrição.drop(
            columns=['Text Description',]
        )

        display(self.df_descrição)

    def mesclando_dataframes_descrição(self):
        self.df = self.df.merge(
            right=self.df_descrição,
            how='right',
            on=['ID']
        )

        display(self.df)

    def dataframe_investimentos(self):
        df = self.df_invested

        display(df)

    def verificando_ids_duplicados(self):
        df = self.df_invested['ID'].duplicated().sum()
        display(df)

    def transformando_a_coluna_backedLocation_em_booleanos(self):
        self.df_invested = pd.get_dummies(
            data=self.df_invested,
            columns=['backedLocation']
        )

        display(self.df_invested)

    def agrupando_o_df_invested_por_apoiadores_em_cada_região(self):
        self.df_invested = self.df_invested.groupby(by=['ID']).agg(
            {
                'age':'mean',
                'backedLocation_BR':'sum',
                'backedLocation_US':'sum',
                'backedLocation_GBK':'sum',
            }
        ).reset_index()

        display(self.df_invested)

    def mesclando_dataframes_invested(self):
        self.df = self.df.merge(
            right=self.df_invested,
            how='right',
            on=['ID']
        )

        display(self.df)

if __name__ == "__main__":
    data_preparation = Data_Preparation()

In [5]:
data_preparation.dataframe()

Unnamed: 0.1,Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,09/10/15,100000,11/08/15 12:12,0,failed,0,GB,0.0,0,USD 153395
1,1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,01/11/17,3000000,02/09/17 04:43,242100,failed,15,US,10000.0,242100,USD 3000000
2,2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,26/02/13,4500000,12/01/13 00:20,22000,failed,3,US,22000.0,22000,USD 4500000
3,3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,16/04/12,500000,17/03/12 03:24,100,failed,1,US,100.0,100,USD 500000
4,4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,29/08/15,1950000,04/07/15 08:35,128300,canceled,14,US,128300.0,128300,USD 1950000


In [6]:
data_preparation.excluindo_colunas()

Unnamed: 0,ID,main_category,currency,deadline,launched,state,backers,country,usd_pledged_real,usd_goal_real
0,1000002330,Publishing,GBP,09/10/15,11/08/15 12:12,failed,0,GB,0,USD 153395
1,1000003930,Film & Video,USD,01/11/17,02/09/17 04:43,failed,15,US,242100,USD 3000000
2,1000004038,Film & Video,USD,26/02/13,12/01/13 00:20,failed,3,US,22000,USD 4500000
3,1000007540,Music,USD,16/04/12,17/03/12 03:24,failed,1,US,100,USD 500000
4,1000011046,Film & Video,USD,29/08/15,04/07/15 08:35,canceled,14,US,128300,USD 1950000
...,...,...,...,...,...,...,...,...,...,...
19458,1098725026,Photography,USD,25/09/15,27/07/15 20:17,failed,0,US,0,USD 1000000
19459,1098726927,Publishing,USD,07/11/14,06/10/14 20:45,failed,7,US,6400,USD 550000
19460,1098729640,Music,USD,30/04/11,31/03/11 18:22,failed,21,US,80000,USD 727500
19461,10987305,Film & Video,USD,17/06/14,18/04/14 22:36,failed,6,US,22000,USD 1000000


In [7]:
data_preparation.removendo_a_string_usd_da_coluna_usd_goal_real()

Unnamed: 0,ID,main_category,currency,deadline,launched,state,backers,country,usd_pledged_real,usd_goal_real
0,1000002330,Publishing,GBP,09/10/15,11/08/15 12:12,failed,0,GB,0,"[, 153395]"
1,1000003930,Film & Video,USD,01/11/17,02/09/17 04:43,failed,15,US,242100,"[, 3000000]"
2,1000004038,Film & Video,USD,26/02/13,12/01/13 00:20,failed,3,US,22000,"[, 4500000]"
3,1000007540,Music,USD,16/04/12,17/03/12 03:24,failed,1,US,100,"[, 500000]"
4,1000011046,Film & Video,USD,29/08/15,04/07/15 08:35,canceled,14,US,128300,"[, 1950000]"
...,...,...,...,...,...,...,...,...,...,...
19458,1098725026,Photography,USD,25/09/15,27/07/15 20:17,failed,0,US,0,"[, 1000000]"
19459,1098726927,Publishing,USD,07/11/14,06/10/14 20:45,failed,7,US,6400,"[, 550000]"
19460,1098729640,Music,USD,30/04/11,31/03/11 18:22,failed,21,US,80000,"[, 727500]"
19461,10987305,Film & Video,USD,17/06/14,18/04/14 22:36,failed,6,US,22000,"[, 1000000]"


In [8]:
data_preparation.removendo_a_hora_e_minuto_da_coluna_launched()

Unnamed: 0,ID,main_category,currency,deadline,launched,state,backers,country,usd_pledged_real,usd_goal_real
0,1000002330,Publishing,GBP,09/10/15,11/08/15,failed,0,GB,0,"[, 153395]"
1,1000003930,Film & Video,USD,01/11/17,02/09/17,failed,15,US,242100,"[, 3000000]"
2,1000004038,Film & Video,USD,26/02/13,12/01/13,failed,3,US,22000,"[, 4500000]"
3,1000007540,Music,USD,16/04/12,17/03/12,failed,1,US,100,"[, 500000]"
4,1000011046,Film & Video,USD,29/08/15,04/07/15,canceled,14,US,128300,"[, 1950000]"
...,...,...,...,...,...,...,...,...,...,...
19458,1098725026,Photography,USD,25/09/15,27/07/15,failed,0,US,0,"[, 1000000]"
19459,1098726927,Publishing,USD,07/11/14,06/10/14,failed,7,US,6400,"[, 550000]"
19460,1098729640,Music,USD,30/04/11,31/03/11,failed,21,US,80000,"[, 727500]"
19461,10987305,Film & Video,USD,17/06/14,18/04/14,failed,6,US,22000,"[, 1000000]"


In [9]:
# data_preparation.convertendo_a_coluna_usd_goal_real_para_int64()

In [10]:
data_preparation.tipos_presentes_no_dataframes()

ID                   int64
main_category       object
currency            object
deadline            object
launched            object
state               object
backers              int64
country             object
usd_pledged_real     int64
usd_goal_real       object
dtype: object

In [11]:
data_preparation.convertendo_a_coluna_deadline_launched_para_o_tipo_datetime()

In [12]:
data_preparation.criando_a_coluna_do_intervalo_de_dias()

Unnamed: 0,ID,main_category,currency,deadline,launched,state,backers,country,usd_pledged_real,usd_goal_real,time_range
0,1000002330,Publishing,GBP,2015-10-09,2015-08-11,failed,0,GB,0,"[, 153395]",59
1,1000003930,Film & Video,USD,2017-11-01,2017-09-02,failed,15,US,242100,"[, 3000000]",60
2,1000004038,Film & Video,USD,2013-02-26,2013-01-12,failed,3,US,22000,"[, 4500000]",45
3,1000007540,Music,USD,2012-04-16,2012-03-17,failed,1,US,100,"[, 500000]",30
4,1000011046,Film & Video,USD,2015-08-29,2015-07-04,canceled,14,US,128300,"[, 1950000]",56
...,...,...,...,...,...,...,...,...,...,...,...
19458,1098725026,Photography,USD,2015-09-25,2015-07-27,failed,0,US,0,"[, 1000000]",60
19459,1098726927,Publishing,USD,2014-11-07,2014-10-06,failed,7,US,6400,"[, 550000]",32
19460,1098729640,Music,USD,2011-04-30,2011-03-31,failed,21,US,80000,"[, 727500]",30
19461,10987305,Film & Video,USD,2014-06-17,2014-04-18,failed,6,US,22000,"[, 1000000]",60


In [13]:
data_preparation.verificando_valores_unicos_no_df_descrição()

array([1])

In [14]:
data_preparation.excluindo_a_coluna_text_description_do_df_descrição()

Unnamed: 0,ID,Video,Image,Infographic,Reviews,Risks
0,1000002330,1,1,1,0,1
1,1000003930,1,1,1,1,0
2,1000004038,0,1,1,0,1
3,1000007540,1,0,1,0,0
4,1000011046,0,1,0,0,0
...,...,...,...,...,...,...
19458,1098725026,0,0,0,0,1
19459,1098726927,1,0,1,0,0
19460,1098729640,1,1,1,1,1
19461,10987305,1,0,0,0,0


In [15]:
data_preparation.mesclando_dataframes_descrição()

Unnamed: 0,ID,main_category,currency,deadline,launched,state,backers,country,usd_pledged_real,usd_goal_real,time_range,Video,Image,Infographic,Reviews,Risks
0,1000002330,Publishing,GBP,2015-10-09,2015-08-11,failed,0,GB,0,"[, 153395]",59,1,1,1,0,1
1,1000003930,Film & Video,USD,2017-11-01,2017-09-02,failed,15,US,242100,"[, 3000000]",60,1,1,1,1,0
2,1000004038,Film & Video,USD,2013-02-26,2013-01-12,failed,3,US,22000,"[, 4500000]",45,0,1,1,0,1
3,1000007540,Music,USD,2012-04-16,2012-03-17,failed,1,US,100,"[, 500000]",30,1,0,1,0,0
4,1000011046,Film & Video,USD,2015-08-29,2015-07-04,canceled,14,US,128300,"[, 1950000]",56,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19458,1098725026,Photography,USD,2015-09-25,2015-07-27,failed,0,US,0,"[, 1000000]",60,0,0,0,0,1
19459,1098726927,Publishing,USD,2014-11-07,2014-10-06,failed,7,US,6400,"[, 550000]",32,1,0,1,0,0
19460,1098729640,Music,USD,2011-04-30,2011-03-31,failed,21,US,80000,"[, 727500]",30,1,1,1,1,1
19461,10987305,Film & Video,USD,2014-06-17,2014-04-18,failed,6,US,22000,"[, 1000000]",60,1,0,0,0,0


In [16]:
data_preparation.dataframe_investimentos()

Unnamed: 0,ID,backedLocation,age
0,1000003930,BR,18
1,1000004038,US,57
2,1000007540,US,43
3,1000011046,BR,29
4,1000014025,BR,77
...,...,...,...
35391,1098718520,US,68
35392,1098726927,BR,30
35393,1098729640,US,20
35394,10987305,BR,19


In [17]:
data_preparation.verificando_ids_duplicados()

18581

In [18]:
data_preparation.transformando_a_coluna_backedLocation_em_booleanos()

Unnamed: 0,ID,age,backedLocation_BR,backedLocation_GBK,backedLocation_US
0,1000003930,18,1,0,0
1,1000004038,57,0,0,1
2,1000007540,43,0,0,1
3,1000011046,29,1,0,0
4,1000014025,77,1,0,0
...,...,...,...,...,...
35391,1098718520,68,0,0,1
35392,1098726927,30,1,0,0
35393,1098729640,20,0,0,1
35394,10987305,19,1,0,0


In [19]:
data_preparation.agrupando_o_df_invested_por_apoiadores_em_cada_região()

Unnamed: 0,ID,age,backedLocation_BR,backedLocation_US,backedLocation_GBK
0,106144,66.500000,0,1,1
1,1003381,48.500000,1,1,0
2,1017454,41.500000,0,2,0
3,1024013,77.000000,2,0,0
4,1024208,71.000000,0,1,1
...,...,...,...,...,...
16810,1098710911,31.000000,2,1,0
16811,1098718520,39.333333,0,1,2
16812,1098726927,38.000000,1,2,0
16813,1098729640,41.333333,0,1,2


In [20]:
data_preparation.mesclando_dataframes_invested()

Unnamed: 0,ID,main_category,currency,deadline,launched,state,backers,country,usd_pledged_real,usd_goal_real,time_range,Video,Image,Infographic,Reviews,Risks,age,backedLocation_BR,backedLocation_US,backedLocation_GBK
0,106144,Music,USD,2014-06-15,2014-05-16,failed,1,US,1500,"[, 180000]",30,1,1,0,0,1,66.500000,0,1,1
1,1003381,Music,USD,2012-07-19,2012-06-21,failed,30,US,115900,"[, 1000000]",28,0,1,0,1,1,48.500000,1,1,0
2,1017454,Film & Video,USD,2015-04-26,2015-04-20,successful,9,US,16700,"[, 100]",6,1,0,1,0,1,41.500000,0,2,0
3,1024013,Publishing,USD,2012-05-18,2012-04-03,failed,136,US,975805,"[, 4000000]",45,1,1,1,0,0,77.000000,2,0,0
4,1024208,Technology,USD,2014-09-11,2014-08-12,failed,5,US,15000,"[, 150000]",30,0,1,1,0,0,71.000000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16810,1098710911,Film & Video,USD,2014-11-06,2014-10-07,failed,26,US,187100,"[, 555000]",30,1,0,1,1,1,31.000000,2,1,0
16811,1098718520,Film & Video,USD,2014-10-01,2014-08-30,failed,2,US,3500,"[, 550000]",32,1,0,0,0,0,39.333333,0,1,2
16812,1098726927,Publishing,USD,2014-11-07,2014-10-06,failed,7,US,6400,"[, 550000]",32,1,0,1,0,0,38.000000,1,2,0
16813,1098729640,Music,USD,2011-04-30,2011-03-31,failed,21,US,80000,"[, 727500]",30,1,1,1,1,1,41.333333,0,1,2
