In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from scipy import stats

import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.formula.api as smf

from statsmodels.stats.multicomp import pairwise_tukeyhsd

import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kucza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
def ListNoDups(mylist):
    mylist = list(dict.fromkeys(mylist))
    return mylist

class CarData:
    
    missing = -1
    duplicates = -1
    
    price_outliers = -1
    mileage_outliers = -1
    year_outliers = -1
    total_discard = -1
    
    corpus = []
    def __init__(self, path, price_outlier_mt = 200000, mileage_outlier_mt = 400000,
                year_outlier_lt = 1995, engine_outlier_mt = 4000, engine_outlier_lt = 750,
                dependent_variable = 'price'):
        
        #define outliers values
        self.price_outlier_mt = price_outlier_mt
        self.mileage_outlier_mt = mileage_outlier_mt
        self.year_outlier_lt =  year_outlier_lt
        self.engine_outlier_mt = engine_outlier_mt
        self.engine_outlier_lt = engine_outlier_lt
        self.dependent_variable = dependent_variable

        
        
        #defince variable data types
        self.numeric_variables = ['price', 'mileage_km', 'engine_cm3', 'year']
        self.categorical_variables = ['engine_type', 'city', 'province']
        
        #read all .csv files from the directory
        self.data = pd.concat(map(pd.read_csv, glob.glob(os.path.join(path, "*.csv"))), sort=False)
        
        #drop the duplicates and save the number of duplicates - many duplicates due to data gathering method
        self.duplicates = len(self.data) - len(self.data.drop_duplicates())
        self.data.drop_duplicates(inplace = True)
        
        #rename columns
        self.data.columns = ['title', 'price', 'sub_title', 'mileage_km', 'year', 'engine_cm3',
                'engine_type', 'city', 'province', 'negotiable']
        
        #drop NaNs and save the number of rows dropped to the missing varaible
        self.missing = self.data['engine_type'].isna().sum()
        self.data.dropna(subset = ['engine_type'], axis = 'index', inplace = True)
        
        self.missing = self.missing + self.data['city'].isna().sum()
        self.data.dropna(subset = ['city'], axis = 'index', inplace = True)
        
        self.missing = self.missing + self.data['engine_cm3'].isna().sum()
        self.data.dropna(subset = ['engine_cm3'], axis = 'index', inplace = True)
        
        #clean up the columns
        self.data['price'] = self.data['price'].apply(lambda x: x.replace(",", ""))
        self.data['price'] = self.data['price'].apply(lambda x: x.replace(" ", "")).astype('int')
        
        self.data['mileage_km'] = self.data['mileage_km'].apply(lambda x: x.replace("km", ""))
        self.data['mileage_km'] = self.data['mileage_km'].apply(lambda x: x.replace(" ", "")).astype('float')
        
        self.data['engine_cm3'] = self.data['engine_cm3'].astype('str')
        self.data['engine_cm3'] = self.data['engine_cm3'].apply(lambda x: x.replace('cm3', ''))
        self.data['engine_cm3'] = self.data['engine_cm3'].apply(lambda x: x.replace(' ','')).astype('int')
        
        self.data['province'] = self.data['province'].astype('str')
        self.data['province'] = self.data['province'].apply(lambda x: x.replace('(',''))
        self.data['province'] = self.data['province'].apply(lambda x: x.replace(')',''))
        
        self.data['sub_title'] = self.data['sub_title'].astype('str') #may change that in the future - possible info loss due to lowercase
        
        
        self.data['title'] = self.data['title'].astype('str') #may change that in the future - possible info loss due to lowercase
        
    
        #Add ID column
        self.data.insert(loc = 0, column = 'ID', value = range(1, len(self.data)+1))

        #discard outliers and calculate the numbers
        self.total_discard = len(self.data) - len(self.data[(self.data['price'] <= self.price_outlier_mt) &
                                                        (self.data['mileage_km'] <= self.mileage_outlier_mt) &
                                                        (self.data['year'] >= self.year_outlier_lt) &
                                                        (self.data['engine_cm3'] <= self.engine_outlier_mt) &
                                                        (self.data['engine_cm3'] >= self.engine_outlier_lt)])        
        
        self.price_outliers = len(self.data[self.data['price'] > price_outlier_mt])
        self.data = self.data[self.data['price'] <= price_outlier_mt]
        
        self.mileage_outliers = len(self.data[self.data['mileage_km'] > mileage_outlier_mt])
        self.data = self.data[self.data['mileage_km'] <= mileage_outlier_mt]
        
        self.year_outliers = len(self.data[self.data['year'] < year_outlier_lt])
        self.data = self.data[self.data['year'] >= year_outlier_lt]
        
        self.engine_outliers = len(self.data[(self.data['engine_cm3'] > engine_outlier_mt) |
                                            (self.data['engine_cm3'] < engine_outlier_lt)])
        self.data = self.data[(self.data['engine_cm3'] <= engine_outlier_mt) & 
                             (self.data['engine_cm3'] >= engine_outlier_lt)]
        
        #NLP
        self.data['concat_title_subtitle'] = self.data['title'] + ' ' + self.data['sub_title']
        self.data['concat_title_subtitle'] = self.data['concat_title_subtitle'].apply(lambda x: x.lower())
        
        #replace cases for correct analysis
        self.data['concat_title_subtitle'] = self.data['concat_title_subtitle'].apply(lambda x: x.replace('+','plus'))
        self.data['concat_title_subtitle'] = self.data['concat_title_subtitle'].apply(lambda x: x.replace('(',''))
        self.data['concat_title_subtitle'] = self.data['concat_title_subtitle'].apply(lambda x: x.replace(')',''))
        
    def describe(self):
        #descriptive statistice
        desc_stats = round(pd.DataFrame(
                        data = self.data[['price', 'mileage_km', 'year', 'engine_cm3']].describe(),
                        columns = self.data[['price', 'mileage_km', 'year', 'engine_cm3']].columns),2)
        return desc_stats
    
    def outliers(self):
        #baisc data about outliers discarded during preprocessing
        print('Offers with price greater than '+str(self.price_outlier_mt)+' have been discarded')
        print('The number of such offers = '+str(self.price_outliers))
        print('')
        print('Offers with mileage greater than '+str(self.mileage_outlier_mt)+' have been discarded')
        print('The number of such offers = '+str(self.mileage_outliers))
        print('')
        print('Offers with year lower than '+str(self.year_outlier_lt)+' have been discarded')
        print('The number of such offers = '+str(self.year_outliers))
        print('')
        print('Offers with engine_cm3 greater than '+str(self.engine_outlier_lt)+
              ' and lower than '+str(self.engine_outlier_mt)+' have been discarded')
        print('The number of such offers = '+str(self.engine_outliers))
        print('')
        print('Total number of discarded offers = '+str(self.total_discard)
              +'('+str(round(self.total_discard/len(self.data)*100,2))+'%)'
              +' - may be different to the sum of above due to overlap')
        
    def scatter_nox(self, var = 'all', figsize_1 = 7, figsize_2 = 5):
        #prints scatter plots with no x axis - a dummy sequence as x axis
        if var != 'all' and var not in self.data.columns:
            print('Variable not found in the dataset')
        if var == 'all':
            plt.rcParams["figure.figsize"] = (figsize_1,figsize_2)
            plt.scatter(y = self.data['mileage_km'], x = range(1, len(self.data)+1), s=1)
            plt.title('mileage_km')
            plt.show()

            plt.scatter(y = self.data['price'], x = range(1, len(self.data)+1), s=1)
            plt.title('price')
            plt.show()

            plt.scatter(y = self.data['year'], x = range(1, len(self.data)+1), s=1)
            plt.title('year')
            plt.show()

            plt.scatter(y = self.data['engine_cm3'], x = range(1, len(self.data)+1), s=1)
            plt.title('engine_cm3')
            plt.show()
        else:
            plt.rcParams["figure.figsize"] = (figsize_1,figsize_2)
            plt.scatter(y = self.data[var], x = range(1, len(self.data)+1), s=1)
            plt.title(var)
            plt.show()
            
    def scatter(self, var = 'all'):
        #prints scatter plots for numerical variables
        if var != 'all' and var not in self.data.columns:
            print('Variable not found in the dataset')
        list_comb = []
        if var == 'all':
            for variable1 in enumerate(self.numeric_variables):
                for variable2 in enumerate(self.numeric_variables):
                    if variable1 != variable2 and variable1[1]+variable2[1] not in list_comb and variable2[1]+variable1[1] not in list_comb:
                        plt.scatter(y = self.data[variable1[1]], x = self.data[variable2[1]], s=1)
                        plt.title("Correlation between "+variable1[1]+' and '+variable2[1])
                        plt.ylabel(variable1[1])
                        plt.xlabel(variable2[1])
                        plt.show()
                        list_comb.append(variable1[1]+variable2[1])
        #else: - TO DO
    
    def hist(self, var = 'all', bins = 50):
        if var != 'all' and var not in self.data.columns:
            print('Variable not found in the dataset')
        #prints histograms for numeric variables
        if var == 'all':
            for variable in enumerate(self.numeric_variables):
                plt.hist(x = self.data[variable[1]], bins = bins)
                plt.title(variable[1])
                plt.show()
     
    def price_cat_vars(self):
        for variable in enumerate(self.categorical_variables):
            # shows desrptive statistics of categorical variables
            print(x.data.groupby(x.data[variable[1]])['price'].describe())
            #the variables need further preprocessing
            
    def add_dummies(self, categorical_list, columns_to_check, delete_from_strings = 'yes'):
        #adds dummmies from cat_list, checks in every column of columns
        for column in enumerate(columns_to_check):
            for category in enumerate(categorical_list):
                col_name = column[1] + '_' + category[1]
                self.data[col_name] = self.data[column[1]].str.contains(cat[1]).astype('int')
                
                #append newly craeted varaibles to categorical variables
                if self.data[col_name].sum() > 0:
                    self.categorical_variables.append(col_name)
                else:
                    self.data.drop(columns = [col_name], inplace = True)
                
                #delete the string from the column
                if delete_from_strings == 'yes':
                    self.data[column] = self.data[column].apply(lambda x: x.replace(category[1], ''))
                
        return self
    
    def add_dummies2(self, categorical_list, delete_from_strings = 'yes'):
        #adds dummmies from cat_list, checks in every column of columns
        for category in enumerate(categorical_list):
            col_name = category[1]
            self.data[col_name] = self.data['concat_title_subtitle'].str.contains(category[1]).astype('int')
                
                #append newly craeted varaibles to categorical variables
            if self.data[col_name].sum() > 0:
                self.categorical_variables.append(col_name)
            else:
                self.data.drop(columns = [col_name], inplace = True)
                
                #delete the string from the column
            if delete_from_strings == 'yes':
                self.data['concat_title_subtitle'] = self.data['concat_title_subtitle'].apply(lambda x: x.replace(category[1], ''))
                
        return self
    """
    def ind_test(self, var, alpha = 0.05):
        if alpha > 1 or alpha < 0:
            print('Incorrect alpha value. Select a value from <0;1>.')
            
        if var != 'all' and var not in self.data.columns:
            print('Variable not found in the dataset')
        pivot = round(self.data.pivot_table(values = 'price', index = var, aggfunc = ['count', 'mean']),2)
        pivot.columns = ['count', 'mean']
        
        mean_price = self.data['price'].mean()
        
        pivot['sm'] = pivot['mean']/((pivot['count'])**(1/2))
        
        pivot['t'] = (pivot['mean']-mean_price)/pivot['sm']
        pivot['df'] = pivot['count']-1

        #calculate p-value
        pivot['t_border'] = stats.t.ppf(1-alpha/2, pivot['df'])
        
        #implementation here is not 100% mathematically correct
        return pivot
    """
    def anova(self, var = 'all', alpha = 0.05):
        if var == 'all':
            for variable in enumerate(self.categorical_variables):
                anova_data = self.data[[variable[1], 'price']].reset_index().copy()
                anova_data.columns = ['index', variable[1], 'price']
                equation_string = 'price ~ '+str(variable[1])
                model = ols(equation_string, data=anova_data).fit()
                anova_table = sm.stats.anova_lm(model, typ=2)
                print(anova_table)
                print()
                
                #pairwise comparisons
                pairwise_comparison = pairwise_tukeyhsd(endog = anova_data['price'],
                                                        groups = anova_data[variable[1]],
                                                        alpha = alpha)
                print(pairwise_comparison)
                print()
        #else:
            #TO DO
            #also TO DO check Anova assumptions
    def make_corpus(self):
        #creates a corpus out of title and subtitle column
        for i in range(0, len(self.data)):
            #string = re.sub('[^a-zA-Z]', ' ', self.data.reset_index().loc[i]['concat_title_subtitle'])
            string = self.data.reset_index().loc[i]['concat_title_subtitle']
            string = string.split()
            self.corpus = self.corpus + string
        self.corpus = ListNoDups(self.corpus)
        
        #with open("corpus.txt", "w") as output:
        #    output.write(str(self.corpus))
    
        return self.corpus
    
    def analyse_variables(self, list_of_variables, discard = 0.01):
    # independence tests for a list of variable e.g. corpus
        final_df = pd.DataFrame(columns = ['variable', 'mean_1', 'mean_0', 'count_1', 'count_0'])

        for variable in enumerate(list_of_variables):
            self.data[variable[1]] = self.data['concat_title_subtitle'].str.contains(variable[1]).astype('int')

            mean_1 = self.data.loc[self.data[variable[1]] == 1][self.dependent_variable].mean()
            mean_0 = self.data.loc[self.data[variable[1]] == 0][self.dependent_variable].mean()

            count_1 = len(self.data.loc[self.data[variable[1]] == 1])
            count_0 = len(self.data.loc[self.data[variable[1]] == 0])        
            
            if count_1 >= discard * len(self.data) and count_0 >= discard * len(self.data):            
                dict_to_append = {
                    'variable' : variable[1],
                    'mean_1' : mean_1,
                    'mean_0' : mean_0,
                    'count_1' : count_1,
                    'count_0' : count_0
                }

                final_df = final_df.append(dict_to_append, ignore_index = True)

            self.data.drop(columns = [variable[1]], inplace = True)

            if variable[0] % 1000 == 0:
                print(str(variable[0])+'/'+str(len(list_of_variables)))
        
        
        
        final_df['mean_diff'] = abs(final_df['mean_1'] - final_df['mean_0'])
        final_df = final_df.sort_values(by = 'mean_diff', ascending = False).reset_index()
        
        return final_df
            

#cv = CountVectorizer(max_features = 1000)
#X = cv.fit_transform(corpus).toarray()

In [12]:
x = CarData('data/', dependent_variable = 'price')

In [13]:
corpus = x.make_corpus()

In [4]:
with open("corpus.txt", "r") as file:
    corpus = eval(file.readline())

SyntaxError: unexpected EOF while parsing (<string>, line 0)

In [14]:
corpus

['mazda',
 'cx-3',
 '150km',
 '6at',
 '4x4',
 'skypassion',
 '(plus',
 'biala',
 'skóra',
 'plus',
 'safety',
 'navi)',
 'peugeot',
 'partner',
 'opłacony~dokumentacja',
 'przebiegu',
 'kia',
 'sportage',
 'opłacona~napęd',
 'škoda',
 'roomster',
 'opłacona~serwis',
 'aso~klimatronic~grzane',
 'siedzenia',
 'opel',
 'combo',
 'klimatyzacja~przebieg~faktura~polski',
 'salon',
 'mitsubishi',
 'lancer',
 '1.8',
 'lancer,',
 'polska,',
 '24000',
 'przebiegu!',
 'bmw',
 'seria',
 '3',
 '320d',
 '5',
 'e60',
 '535d',
 '272km',
 '2006r.',
 'head',
 'up',
 'start',
 'stop',
 'xenony',
 'bogate',
 'wyposażenie',
 'jeep',
 'grand',
 'cherokee',
 'nan',
 '7',
 '740d',
 'xdrive.',
 'pl.',
 '1',
 'właściciel.',
 'soft',
 'close.',
 'komfortowy',
 'dostęp.',
 'volkswagen',
 'polo',
 'lancia',
 'musa',
 '1.4',
 'piękny',
 'i',
 'oszczędny',
 'honda',
 'accord',
 '2.0',
 'vtec',
 'sport',
 '155',
 'km',
 'infiniti',
 'q50',
 'q50s',
 'uszkodzone',
 'prosto',
 'z',
 'ubezpieczalni',
 '1.6',
 'cr-v',
 '

In [15]:
for part in enumerate(corpus):
    print(x.data['concat_title_subtitle'].str.contains(part[1]).astype('int'))

0       1
1       0
2       0
3       0
4       0
       ..
4827    0
4828    0
4829    0
4830    0
4831    0
Name: concat_title_subtitle, Length: 51998, dtype: int32
0       1
1       0
2       0
3       0
4       0
       ..
4827    0
4828    0
4829    0
4830    0
4831    0
Name: concat_title_subtitle, Length: 51998, dtype: int32
0       1
1       0
2       0
3       0
4       0
       ..
4827    0
4828    0
4829    0
4830    0
4831    1
Name: concat_title_subtitle, Length: 51998, dtype: int32
0       1
1       0
2       0
3       0
4       0
       ..
4827    0
4828    0
4829    0
4830    0
4831    0
Name: concat_title_subtitle, Length: 51998, dtype: int32
0       1
1       0
2       1
3       0
4       0
       ..
4827    0
4828    0
4829    0
4830    0
4831    0
Name: concat_title_subtitle, Length: 51998, dtype: int32
0       1
1       0
2       0
3       0
4       0
       ..
4827    0
4828    0
4829    0
4830    0
4831    0
Name: concat_title_subtitle, Length: 51998, dtype: int3

error: missing ), unterminated subpattern at position 0

In [6]:
results = x.analyse_variables(list_of_variables = corpus)

0/30206


error: nothing to repeat at position 1

In [9]:
pd.set_option('display.max_rows', 10000)

In [57]:
results.sort_values(by = 'mean_diff', ascending = False)

Unnamed: 0,index,variable,mean_1,mean_0,count_1,count_0,mean_diff
0,380,xd,95183.925553,31843.872633,497,47689,63340.05292
1,300,drive,87319.15512,31731.17575,664,47522,55587.97937
2,190,wd,71909.489899,32005.265927,594,47592,39904.223972
3,395,ale,69983.672199,32118.411936,482,47704,37865.260263
4,82,fv,67692.788476,31506.647855,1319,46867,36186.140622
5,199,pakiet,63046.694226,32006.31056,762,47424,31040.383666
6,611,akiet,62983.592398,32006.671278,763,47423,30976.921121
7,548,pak,62311.954603,31998.300867,793,47393,30313.653736
8,98,quattro,61711.662848,32121.976395,611,47575,29589.686453
9,434,sg,61518.505051,32012.199793,792,47394,29506.305257


In [51]:
corpus

['mazda',
 'cx',
 'km',
 'at',
 'x',
 'skypassion',
 'biala',
 'sk',
 'ra',
 'safety',
 'navi',
 'peugeot',
 'partner',
 'op',
 'acony',
 'dokumentacja',
 'przebiegu',
 'kia',
 'sportage',
 'acona',
 'nap',
 'd',
 'koda',
 'roomster',
 'serwis',
 'aso',
 'klimatronic',
 'grzane',
 'siedzenia',
 'opel',
 'combo',
 'klimatyzacja',
 'przebieg',
 'faktura',
 'polski',
 'salon',
 'mitsubishi',
 'lancer',
 'polska',
 'bmw',
 'seria',
 'e',
 'r',
 'head',
 'up',
 'start',
 'stop',
 'xenony',
 'bogate',
 'wyposa',
 'enie',
 'jeep',
 'grand',
 'cherokee',
 'nan',
 'xdrive',
 'pl',
 'w',
 'a',
 'ciciel',
 'soft',
 'close',
 'komfortowy',
 'dost',
 'p',
 'volkswagen',
 'polo',
 'lancia',
 'musa',
 'pi',
 'kny',
 'i',
 'oszcz',
 'dny',
 'honda',
 'accord',
 'vtec',
 'sport',
 'infiniti',
 'q',
 's',
 'uszkodzone',
 'prosto',
 'z',
 'ubezpieczalni',
 'cr',
 'v',
 'lpg',
 'stag',
 'mercedes',
 'benz',
 'clk',
 'automat',
 'avantgarde',
 'alu',
 'szyberdach',
 'xenon',
 'ford',
 'maverick',
 'civic',

In [None]:
x.data.to_csv('data_with_corpus.csv')

In [73]:
cv = CountVectorizer(max_features = 331826)

In [74]:
cv.fit_transform(corpus).toarray()

MemoryError: 

In [18]:
x = x.add_dummies(['ford', 'mercedes', 'suzuki'], ['title', 'sub_title'])

In [19]:
x.data.head()

Unnamed: 0,ID,title,price,sub_title,mileage_km,year,engine_cm3,engine_type,city,province,negotiable,title_ford,title_mercedes,title_suzuki,sub_title_ford,sub_title_mercedes,sub_title_suzuki
0,1,smart fortwo cabrio,18800,cabrio automat nawigacja kamera led szwajcaria,37000.0,2016,999,Benzyna,Bielsko-Biała,Śląskie,"Do negocjacji, Faktura VAT",0,0,0,0,0,0
1,2,ford kuga,34700,nawigacja/kamera cofania/zadbany/,198000.0,2011,2000,Diesel,Dąbrowa,Pomorskie,Do negocjacji,1,0,0,0,0,0
2,3,chevrolet captiva 2.4,27900,2.4 136km klima alus navi dvd skóra 4x4 poleca...,246000.0,2009,2405,Benzyna,Płock,Mazowieckie,Do negocjacji,0,0,0,0,0,0
3,4,renault clio 1.2,37900,1.2 benz 73 km !! idealny !! full !! warszawa,15000.0,2017,1149,Benzyna,Warszawa,Mazowieckie,Do negocjacji,0,0,0,0,0,0
4,5,opel meriva,7900,mały przebieg 109 tyś/km bezwypadkowy,109000.0,2006,1400,Benzyna,Żyrardów,Mazowieckie,,0,0,0,0,0,0


In [48]:
data = x.data

In [58]:
data2 = AddDummy(data, 'title', 'ford')

In [66]:
data['price'].sum()

786919459

In [3]:
x = CarData('data/')

In [4]:
x.outliers()

Offers with price greater than 200000 have been discarded
The number of such offers = 294

Offers with mileage greater than 400000 have been discarded
The number of such offers = 166

Offers with year lower than 1995 have been discarded
The number of such offers = 288

Offers with engine_cm3 greater than 750 and lower than 4000 have been discarded
The number of such offers = 378

Total number of discarded offers = 1126(4.68%) - may be different to the sum of above due to overlap


In [5]:
x.data.head()

Unnamed: 0,ID,title,price,sub_title,mileage_km,year,engine_cm3,engine_type,city,province,negotiable
0,1,Smart Fortwo cabrio,18800,Cabrio Automat Nawigacja Kamera Led Szwajcaria,37000.0,2016,999,Benzyna,Bielsko-Biała,Śląskie,"Do negocjacji, Faktura VAT"
1,2,Ford Kuga,34700,Nawigacja/kamera cofania/Zadbany/,198000.0,2011,2000,Diesel,Dąbrowa,Pomorskie,Do negocjacji
2,3,Chevrolet Captiva 2.4,27900,2.4 136km klima alus navi DVD skóra 4x4 Poleca...,246000.0,2009,2405,Benzyna,Płock,Mazowieckie,Do negocjacji
3,4,Renault Clio 1.2,37900,1.2 BENZ 73 KM !! IDEALNY !! FULL !! Warszawa,15000.0,2017,1149,Benzyna,Warszawa,Mazowieckie,Do negocjacji
4,5,Opel Meriva,7900,mały przebieg 109 tyś/km bezwypadkowy,109000.0,2006,1400,Benzyna,Żyrardów,Mazowieckie,


In [20]:
x.missing, x.duplicates

(737, 8782)

In [64]:
x.describe()

Unnamed: 0,price,mileage_km,year,engine_cm3
count,24073.0,24073.0,24073.0,24073.0
mean,32688.88,159511.92,2009.88,1789.87
std,31457.86,83724.08,5.33,483.63
min,250.0,0.0,1995.0,796.0
25%,12200.0,99000.0,2006.0,1461.0
50%,22900.0,163000.0,2010.0,1794.0
75%,41800.0,217000.0,2014.0,1997.0
max,200000.0,400000.0,2020.0,4000.0


In [None]:
x.scatter_nox()

In [None]:
plt.hist(x.data['mileage_km'], bins = 100)
plt.title('abc')
plt.show()

plt.hist(x.data['price'],bins = 100)
plt.show()

plt.hist(x.data['year'], bins = 100)
plt.show()

plt.hist(x.data['engine_cm3'], bins = 100)
plt.show()

In [None]:
x.data[x.data['price']>1100000]

In [None]:
x.data[x.data['mileage_km']>1500000]

In [None]:
x.data[x.data['year']<1990]

In [None]:
round(pd.DataFrame(
    data = x.data[['price', 'mileage_km', 'year', 'engine_cm3']].describe(),
    columns = x.data[['price', 'mileage_km', 'year', 'engine_cm3']].columns),2)

In [None]:
x.data.isnull().sum()

In [None]:
data = pd.read_csv('run_results_21-04-2020.csv')

In [None]:
data = ReadAndPreprocess(df)

In [None]:
df = pd.read_csv('data/run_results_21-04-2020.csv')
df.head()

In [None]:
df2 = pd.read_csv('data/run_results_25-04_2020.csv')
df2.head()

In [None]:
df3 = pd.read_csv('data/run_results_26-04-2020.csv')
df3.head()

In [None]:
string = re.sub('[^a-zA-Z]', ' ', self.data.reset_index().loc[i]['concat_title_subtitle'])
string = string.split()

In [17]:
string = 'Ala MA KOtA123łćż            '

In [18]:
string_sub = re.sub('[^a-zA-Z]', ' ', string)
string_sub

'Ala MA KOtA                  '

In [20]:
string_split = string_sub.split()
string_split

['Ala', 'MA', 'KOtA']