# Aula 01 - Introdução ao Scikit-Learn
#### Material baseado no livro "Practical Machine Learning with Python", de Sarkar, D. et al. (2018)

---

# Numpy
## Arrays

In [1]:
import numpy as np
arr = np.array([1,3,4,5,6])
arr

array([1, 3, 4, 5, 6])

In [2]:
arr.shape

(5,)

In [3]:
arr.dtype

dtype('int32')

In [4]:
arr = np.array([1,'st','er',3])
arr.dtype

dtype('<U11')

In [5]:
# Left commented out on purpose: arr now holds strings (dtype '<U11'), so summing it is expected to raise a TypeError.
#np.sum(arr)

### Creating arrays

In [None]:
arr = np.array([[1,2,3],[2,4,6],[8,8,8]])
arr.shape

In [None]:
arr

In [None]:
arr = np.zeros((2,4))
arr

In [None]:
arr = np.ones((2,4))
arr

In [None]:
arr = np.identity(3)
arr

In [None]:
arr = np.random.randn(3,4)
arr

### Accessing array elements
#### Simple indexing

In [None]:
arr[1]

In [None]:
arr = np.arange(12).reshape(2,2,3)
arr

In [None]:
arr[0]

In [None]:
arr = np.arange(10)
arr[5:]


In [None]:
arr[5:8]

In [None]:
arr[:-5]

In [None]:
arr = np.arange(12).reshape(2,2,3)
arr

In [None]:
arr[1:2]

In [None]:
arr = np.arange(27).reshape(3,3,3)
arr

In [None]:
arr[:,:,2]

In [None]:
arr[...,2]

#### Advanced Indexing

In [None]:
arr = np.arange(9).reshape(3,3)
arr

In [None]:
# Integer-array ("fancy") indexing: the two lists are paired element-wise as (row, col),
# so this picks arr[0,1], arr[1,0] and arr[2,0] and returns them as a 1-D array.
# With arr = np.arange(9).reshape(3,3) the result is array([1, 3, 6]).
arr[[0,1,2],[1,0,0]]

##### Boolean Indexing

In [None]:
cities = np.array(["delhi","banglaore","mumbai","chennai","bhopal"])
city_data = np.random.randn(5,3)
city_data

In [None]:
city_data[cities =="delhi"]

In [None]:
city_data[city_data >0]

In [None]:
city_data[city_data >0] = 0
city_data


#### Operations on arrays

In [None]:
arr = np.arange(15).reshape(3,5)
arr

In [None]:
arr + 5

In [None]:
arr * 2

In [None]:
# Broadcasting demo: arr1 has shape (5, 3) and arr2 has shape (5, 1).
# The size-1 axis of arr2 is stretched across the 3 columns of arr1, so row i of
# the result is arr1[i] + i and the sum has shape (5, 3).
arr1 = np.arange(15).reshape(5,3)
arr2 = np.arange(5).reshape(5,1)
arr2 + arr1


In [None]:
arr1

In [None]:
arr2

In [None]:
arr1 = np.random.randn(5,3)
arr1

In [None]:
np.modf(arr1)

#### Linear algebra using numpy

In [None]:
# Matrix product of two 3x3 integer matrices. `@` is the matrix-multiplication
# operator and gives the same result as A.dot(B) for 2-D arrays.
A = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
B = np.array([
    [9, 8, 7],
    [6, 5, 4],
    [1, 2, 3],
])
A @ B

In [None]:
A = np.arange(15).reshape(3,5)
A.T

# Pandas
## Data frames

In [None]:
import pandas as pd
d =  [{'city':'Delhi',"data":1000},
      {'city':'Banglaore',"data":2000},
      {'city':'Mumbai',"data":1000}]
pd.DataFrame(d)

In [None]:
df = pd.DataFrame(d)

### Reading in data

In [None]:
city_data = pd.read_csv(filepath_or_buffer='simplemaps-worldcities-basic.csv')

In [None]:
city_data.head(n=10)

In [None]:
city_data.tail()

In [None]:
series_es = city_data.lat

In [None]:
type(series_es)

In [None]:
series_es[1:10:2]

In [None]:
series_es[:7]

In [None]:
series_es[:-7315]

In [None]:
city_data[:7]

In [None]:
city_data.iloc[:5,:4]

In [None]:
# Cities with more than 10 million inhabitants, restricted to the columns whose
# name starts with 'l'. A single .loc call applies the row mask and the column
# mask together instead of chaining two [] lookups on an intermediate frame.
city_data.loc[
    city_data['pop'] > 10000000,
    city_data.columns.str.startswith('l'),
]

In [None]:
# Select the cities with more than 10 million inhabitants and rename 'pop' to
# 'population'. rename() returns a new frame, so the filtered result is never
# modified in place (an inplace=True rename on a filtered slice can trigger
# pandas' SettingWithCopyWarning).
city_greater_10mil = (
    city_data[city_data['pop'] > 10000000]
    .rename(columns={'pop': 'population'})
)
# where() keeps the frame's shape: rows that fail the condition are filled with
# NaN rather than dropped.
city_greater_10mil.where(city_greater_10mil.population > 15000000)

In [None]:
df = pd.DataFrame(np.random.randn(8, 3),
columns=['A', 'B', 'C'])

### Operations on dataframes

In [None]:
nparray = df.values
type(nparray)

In [None]:
from numpy import nan
df.iloc[4,2] = nan

In [None]:
df

In [None]:
df.fillna(0)

In [None]:
columns_numeric = ['lat','lng','pop']

In [None]:
city_data[columns_numeric].mean()

In [None]:
city_data[columns_numeric].sum()

In [None]:
city_data[columns_numeric].count()

In [None]:
city_data[columns_numeric].median()

In [None]:
city_data[columns_numeric].quantile(0.8)

In [None]:
city_data[columns_numeric].sum(axis = 1).head()

In [None]:
city_data[columns_numeric].describe()

In [None]:
city_data1 = city_data.sample(3)

### Concatenating data frames

In [None]:
city_data2 = city_data.sample(3)
city_data_combine = pd.concat([city_data1,city_data2])
city_data_combine

In [None]:
country_data = city_data[['iso3','country']].drop_duplicates()

In [None]:
country_data.shape

In [None]:
country_data.head()

In [None]:
# Remove the 'country' column from city_data. drop() with errors='ignore' keeps
# the cell safe to re-run: `del city_data['country']` raises KeyError once the
# column is already gone.
city_data = city_data.drop(columns=['country'], errors='ignore')

In [None]:
city_data.merge(country_data, 'inner').head()

# Scikit-learn

In [None]:
from sklearn import datasets
diabetes = datasets.load_diabetes()
# Keep the first 10 samples for inspection. X and y are sliced with the same
# bound so that row i of X always corresponds to y[i]; slicing only X would
# leave the two arrays with different lengths.
X = diabetes.data[:10]
y = diabetes.target[:10]

In [None]:
X[:5]

In [None]:
y[:10]

In [None]:
feature_names=['age', 'sex', 'bmi', 'bp',
               's1', 's2', 's3', 's4', 's5', 's6']

## Scikit-learn regression example

In [2]:
from sklearn import datasets, linear_model
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
import numpy as np

# Number of leading samples used for training; everything after it is held out
# for testing. Defined once so the four slices below cannot drift apart.
TRAIN_SIZE = 310

diabetes = datasets.load_diabetes()
X_train = diabetes.data[:TRAIN_SIZE]
y_train = diabetes.target[:TRAIN_SIZE]

X_test = diabetes.data[TRAIN_SIZE:]
y_test = diabetes.target[TRAIN_SIZE:]

# Lasso regression with the estimator's default settings, apart from a fixed random_state.
learner = Lasso(random_state=0)
# 30 candidate alpha values, log-spaced between 1e-4 and 10**-0.5.
# NOTE(review): neither `alphas` nor GridSearchCV is used in this cell;
# presumably they are meant for a later grid search. Confirm.
alphas = np.logspace(-4, -0.5, 30)

# Last expression of the cell: fits the model and displays the fitted estimator.
learner.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=0,
   selection='cyclic', tol=0.0001, warm_start=False)

In [3]:
y_pred = learner.predict(X_test)
y_pred

array([167.65590881, 169.41804258, 129.69030102, 188.51035557,
       161.07488611, 140.22648857, 181.632146  , 147.46393226,
       156.62543629, 162.42765884, 173.00489271, 202.95330137,
       218.2143445 , 192.50603431, 175.56187783, 198.88195891,
       158.32228911, 191.03430418, 174.08293369, 125.64306682,
       153.66285133, 128.00513265, 207.319705  , 158.86228461,
       113.02317049, 116.57261186, 208.64417948, 148.43243797,
       131.72581351, 163.30914644, 133.99709136, 163.48743068,
       150.63595167, 137.84769342, 149.37118579, 149.79279634,
       178.53893884, 141.80797043, 140.22648857, 109.83195519,
       196.27872624, 117.03775828, 118.86335256, 176.62378086,
       192.33997016, 136.38667085, 129.74421689, 185.52048555,
        93.13821335, 158.0847714 , 162.86777549, 123.87843697,
       189.10327758, 148.98373553, 153.92278437, 148.15712016,
       223.41891182, 222.40004333, 181.0746507 , 162.6229596 ,
       109.83195519, 188.44825201, 139.0914012 , 132.82

In [4]:
# Mean absolute error between the predictions and the held-out targets.
np.abs(y_pred - y_test).mean()

51.37617602903816