# Vilnius Neighbourhoods and Streets
## By Svajune Klimasauskaite

In [22]:
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
from pandas import DataFrame as df
import matplotlib.pyplot as plt
import seaborn as sb
import datetime as dt
#from google.cloud import storage

%matplotlib inline

## Load Data

In [2]:
#Load data from the file which is stored in Dataset repository. 
neigh=pd.read_csv("Dataset/vilniaus_seniunijos.csv", sep=';')
neigh.sample(5)

Unnamed: 0,neighbourhood,street
169,Antakalnis,Laurų g. (70)
54,Antakalnis,Girininkijos g. (26)
620,Naujininkai,Alytaus g. (19)
804,Naujininkai,Miškinių Sodų g. (29)
107,Antakalnis,Juodvarnių Sodų 6-oji g. (9)


**The following data cleaning and augmentation tasks should be performed:** 
1. Add a new column Vilnius, so that a hierarchy of Municipalities and Neighbourhoods could be created.  
2. The numbers in bracket indicate the number of houses in the street, so I need to move it to the new column as well as remove this info from street column. 
3. Some of the streets are overlaping within several neighbourhoods, therefore I extract this info and add in the new column as well as clean the house column.
4. Street in the Collective Garden Area, kas indication in the street name, therefore I want to move it into a separate column. 

## Data Cleaning and Augmenting

### 1. Add Municipality Column

In [3]:
neigh['municipality'] = 'Vilnius'
neigh.sample(1)

Unnamed: 0,neighbourhood,street,municipality
1841,Verkiai,Akademijos g. (3),Vilnius


### 2. Add Number of Houses

#### Create New Column

In [4]:
neigh['houses'] = neigh['street'].str.split('(').str[1].str.split(')').str[0]
neigh['houses'] = neigh['houses'].astype(str)
neigh.houses.unique()

array(['28', '17', '48', '147', '12', '22', '8', '6', '5', '3', '16',
       '25', '40', '55', '39', '87', '43', 'nan', '10', '21', '19', '35',
       '14', '29', '13', '9 iš 112*', '7', '61', '50', '27', '34', '15',
       '42', '36', '26', '20', '101', '18', '77', '38', '24', '56', '11',
       '30', '32', '51', '9', '1', '35 iš 96*', '1 iš 9*', '45', '91',
       '4', '2', '60', '65', '70', '47', '57', '23', '49', '41', '37',
       '88', '160', '94', '12 iš 41*', '69', '33', '124', '53 iš 82*',
       '31', '35 iš 66*', '46', '73', '72', '67', '42 iš 44*', '133',
       '54', '62 iš 124*', '11 iš 124*', '46 iš 53*', '282', '38 iš 74*',
       '12 iš 104*', '99', '78 iš 80*', '14 iš 16*', '1 iš 52*', '63',
       '13 iš 19*', '126', '29 iš 146*', '32 iš 34*', '15 iš 67*',
       '14 iš 124*', '5 iš 15*', '11 iš 23*', '57 iš 78*', '9 iš 47*',
       '45 iš 67*', '9 iš 16*', '1 iš 29*', '21 iš 28*', '34 iš 45*',
       '70 iš 81*', '23 iš 40*', '74 iš 89*', '49 iš 52*', '8 iš 40*',
  

#### Clean Street Column

In [5]:
neigh['street'] = neigh['street'].str.split('(').str[0]
neigh.sample(5)

Unnamed: 0,neighbourhood,street,municipality,houses
1620,Rasos,Stalių g.,Vilnius,16
532,Naujamiestis,Amatų g.,Vilnius,4
62,Antakalnis,Gvazdikų Sodų 11-oji g.,Vilnius,15
2057,Verkiai,Krakiškių Sodų 3-ioji g.,Vilnius,15
2100,Verkiai,Molėtų pl.,Vilnius,54


### 3. Add Indication for Overlaping Streets within Several Neighbourhoods

#### Create New Column

In [6]:
overlap = []

for i in neigh['houses']:
    if '*' in i: 
        overlap.append(i)
    else:
        overlap.append('No')
        
neigh['overlap'] = overlap

neigh.sample(10)

Unnamed: 0,neighbourhood,street,municipality,houses,overlap
1749,Senamiestis,T. Kosciuškos g.,Vilnius,35,No
1722,Senamiestis,Reformatų skv.,Vilnius,,No
796,Naujininkai,Liepkalnio g.,Vilnius,74 iš 242*,74 iš 242*
364,Antakalnis,Vikučionių g.,Vilnius,12,No
2327,Viršuliškės,Viršuliškių skg.,Vilnius,23,No
1958,Verkiai,Birelių Sodų 21-oji g.,Vilnius,11,No
166,Antakalnis,Latežerio g.,Vilnius,6,No
1260,Paneriai,Lentvario g.,Vilnius,62,No
1046,Naujoji Vilnia,R. Mackonio g.,Vilnius,19,No
480,Fabijoniškės,S. Stanevičiaus g.,Vilnius,126,No


#### Clean House Column

In [7]:
houses = []

for i in neigh['houses']:
    if 'iš' in i: 
        houses.append(i.split('iš')[0])
    elif '*' in i:
        houses.append(i.split('*')[0])
    else:
        houses.append(i)
        
neigh['houses'] = houses

In [8]:
neigh.houses.unique()

array(['28', '17', '48', '147', '12', '22', '8', '6', '5', '3', '16',
       '25', '40', '55', '39', '87', '43', 'nan', '10', '21', '19', '35',
       '14', '29', '13', '9 ', '7', '61', '50', '27', '34', '15', '42',
       '36', '26', '20', '101', '18', '77', '38', '24', '56', '11', '30',
       '32', '51', '9', '1', '35 ', '1 ', '45', '91', '4', '2', '60',
       '65', '70', '47', '57', '23', '49', '41', '37', '88', '160', '94',
       '12 ', '69', '33', '124', '53 ', '31', '46', '73', '72', '67',
       '42 ', '133', '54', '62 ', '11 ', '46 ', '282', '38 ', '99', '78 ',
       '14 ', '63', '13 ', '126', '29 ', '32 ', '15 ', '5 ', '57 ', '45 ',
       '21 ', '34 ', '70 ', '23 ', '74 ', '49 ', '8 ', '40 ', '27 ', '6 ',
       '17 ', '66', '3 ', '64 ', '113', '62', '81 ', '104 ', '28 ', '37 ',
       '59', '44', '81', '52', '71', '90', '64', '20 ', '96', '127',
       '10 ', '80', '61 ', '24 ', '104', '142', '39 ', '53', '118', '2 ',
       '107', '31 ', '95', '7 ', '66 ', '131', '86', 

### 4. Add Street Type

In [10]:
str_type = []

for i in neigh['street']:
    if 'Sodų' in i: 
        str_type.append('Sodas')
    else:
        str_type.append('Kitas')
        
neigh['str_type'] = str_type

In [13]:
neigh.sample(5)

Unnamed: 0,neighbourhood,street,municipality,houses,overlap,str_type
2065,Verkiai,Kryžiokų g.,Vilnius,420,No,Kitas
1419,Pašilaičiai,Vincento Gorskio g.,Vilnius,26,No,Kitas
885,Naujininkai,Salininkų Sodų 5-oji g.,Vilnius,34,No,Sodas
965,Naujoji Vilnia,M. Davainio-Silvestraičio g.,Vilnius,4,No,Kitas
1245,Paneriai,Kazimiero Jokanto g.,Vilnius,25,No,Kitas


## Save the File 

In [21]:
#df.to_csv(neigh, '⁨Google Drive⁩/My Drive⁩/Vilniaus duomenys⁩/Vilnius.csv', sep=';')