In [1]:
import pandas as pd
# turn of warning messages
pd.options.mode.chained_assignment = None  # default='warn'

# get data
df = pd.read_csv('https://raw.githubusercontent.com/dipanjanS/practical-machine-learning-with-python/master/notebooks/Ch01_Machine_Learning_Basics/student_records.csv')
df

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Henry,A,Y,90,85,Yes
1,John,C,N,85,51,Yes
2,David,F,N,10,17,No
3,Holmes,B,Y,75,71,No
4,Marvin,E,N,20,30,No
5,Simon,A,Y,92,79,Yes
6,Robert,B,Y,60,59,No
7,Trent,C,Y,75,33,No


In [2]:
# get features and corresponding outcomes
feature_names = ['OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']
training_features = df[feature_names]

outcome_name = ['Recommend']
outcome_labels = df[outcome_name]

In [3]:
# view features
training_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,90,85
1,C,N,85,51
2,F,N,10,17
3,B,Y,75,71
4,E,N,20,30
5,A,Y,92,79
6,B,Y,60,59
7,C,Y,75,33


In [4]:
# view outcome labels
outcome_labels

Unnamed: 0,Recommend
0,Yes
1,Yes
2,No
3,No
4,No
5,Yes
6,No
7,No


In [5]:
# list down features based on type
numeric_feature_names = ['ResearchScore', 'ProjectScore']
categoricial_feature_names = ['OverallGrade', 'Obedient']

In [6]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()  #standardization of numbers

# fit scaler on numeric features
ss.fit(training_features[numeric_feature_names])

# scale numeric features now
training_features[numeric_feature_names] = ss.transform(training_features[numeric_feature_names])

# view updated featureset
training_features

  return self.partial_fit(X, y)
  


Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,0.899583,1.37665
1,C,N,0.730648,-0.091777
2,F,N,-1.80339,-1.560203
3,B,Y,0.392776,0.772004
4,E,N,-1.465519,-0.998746
5,A,Y,0.967158,1.117516
6,B,Y,-0.114032,0.253735
7,C,Y,0.392776,-0.869179


In [7]:
training_features = pd.get_dummies(training_features, columns=categoricial_feature_names)
# view newly engineering features
training_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_B,OverallGrade_C,OverallGrade_E,OverallGrade_F,Obedient_N,Obedient_Y
0,0.899583,1.37665,1,0,0,0,0,0,1
1,0.730648,-0.091777,0,0,1,0,0,1,0
2,-1.80339,-1.560203,0,0,0,0,1,1,0
3,0.392776,0.772004,0,1,0,0,0,0,1
4,-1.465519,-0.998746,0,0,0,1,0,1,0
5,0.967158,1.117516,1,0,0,0,0,0,1
6,-0.114032,0.253735,0,1,0,0,0,0,1
7,0.392776,-0.869179,0,0,1,0,0,0,1


In [8]:
# get list of new categorical features
categorical_engineered_features = list(set(training_features.columns) - set(numeric_feature_names))

In [9]:
categorical_engineered_features

['OverallGrade_E',
 'OverallGrade_B',
 'OverallGrade_A',
 'OverallGrade_C',
 'Obedient_N',
 'OverallGrade_F',
 'Obedient_Y']

In [10]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# fit the model
lr = LogisticRegression() 
model = lr.fit(training_features, np.array(outcome_labels['Recommend']))
# view model parameters
model



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
# simple evaluation on training data
pred_labels = model.predict(training_features)
actual_labels = np.array(outcome_labels['Recommend'])

# evaluate model performance
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('Accuracy:', float(accuracy_score(actual_labels, pred_labels))*100, '%')
print('Classification Stats:')
print(classification_report(actual_labels, pred_labels))

Accuracy: 100.0 %
Classification Stats:
              precision    recall  f1-score   support

          No       1.00      1.00      1.00         5
         Yes       1.00      1.00      1.00         3

   micro avg       1.00      1.00      1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



In [12]:
from sklearn.externals import joblib
import os
# save models to be deployed on your server
if not os.path.exists('Model'):
    os.mkdir('Model')
if not os.path.exists('Scaler'):
    os.mkdir('Scaler') 
    
joblib.dump(model, r'Model/model.pickle') 
joblib.dump(ss, r'Scaler/scaler.pickle')

['Scaler/scaler.pickle']

In [13]:
# load model and scaler objects
model = joblib.load(r'Model/model.pickle')
scaler = joblib.load(r'Scaler/scaler.pickle')

In [14]:
## data retrieval
new_data = pd.DataFrame([{'Name': 'Nathan', 'OverallGrade': 'F', 'Obedient': 'N', 'ResearchScore': 30, 'ProjectScore': 20},
                  {'Name': 'Thomas', 'OverallGrade': 'A', 'Obedient': 'Y', 'ResearchScore': 78, 'ProjectScore': 80}])
new_data = new_data[['Name', 'OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']]
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore
0,Nathan,F,N,30,20
1,Thomas,A,Y,78,80


In [15]:
## data preparation
prediction_features = new_data[feature_names]

# scaling
prediction_features[numeric_feature_names] = scaler.transform(prediction_features[numeric_feature_names])

# engineering categorical variables
prediction_features = pd.get_dummies(prediction_features, columns=categoricial_feature_names)

# view feature set
prediction_features

  """


Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y
0,-1.127647,-1.430636,0,1,1,0
1,0.494137,1.160705,1,0,0,1


In [16]:
# add missing categorical feature columns
current_categorical_engineered_features = set(prediction_features.columns) - set(numeric_feature_names)
missing_features = set(categorical_engineered_features) - current_categorical_engineered_features
for feature in missing_features:
    # add zeros since feature is absent in these data samples
    prediction_features[feature] = [0] * len(prediction_features) 

# view final feature set
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y,OverallGrade_E,OverallGrade_B,OverallGrade_C
0,-1.127647,-1.430636,0,1,1,0,0,0,0
1,0.494137,1.160705,1,0,0,1,0,0,0


In [17]:
## predict using model
predictions = model.predict(prediction_features)

## display results
new_data['Recommend'] = predictions
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Nathan,F,N,30,20,No
1,Thomas,A,Y,78,80,Yes


In [16]:
import matplotlib.pyplot as plt
from nltk.parse.stanford import StanfordParser
import numpy as np

%matplotlib inline

In [19]:
np.__version__

'1.15.4'

In [33]:
sentence = 'The quick brown fox jumps over the lazy dog'

# create parser object
scp = StanfordParser(path_to_jar='Sample_Files/stanford-parser.jar/',
                   path_to_models_jar='Sample_Files/stanford-parser-3.5.2-models.jar')

scp

Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.
  """


LookupError: Could not find stanford-parser\.jar jar file at Sample_Files/stanford-parser.jar/

In [21]:
## vectors

x = [1, 2, 3, 4, 5]
x

[1, 2, 3, 4, 5]

In [22]:
# using numpy
import numpy as np
x = np.array([1, 2, 3, 4, 5])

print(x)
print(type(x))

[1 2 3 4 5]
<class 'numpy.ndarray'>


In [23]:
## matrices

m = np.array([[1, 5, 2],
              [4, 7, 4],
              [2, 0, 9]])

# view matrix
print(m)

# view dimensions
print(m.shape)

[[1 5 2]
 [4 7 4]
 [2 0 9]]
(3, 3)


In [24]:
# matrix transpose
print('Matrix Transpose:\n', m.transpose(), '\n')

Matrix Transpose:
 [[1 4 2]
 [5 7 0]
 [2 4 9]] 



In [25]:
# matrix determinant
print ('Matrix Determinant:', np.linalg.det(m), '\n')

Matrix Determinant: -105.00000000000006 



In [26]:
# matrix inverse
m_inv = np.linalg.inv(m)
print ('Matrix inverse:\n', m_inv, '\n')

Matrix inverse:
 [[-0.6         0.42857143 -0.05714286]
 [ 0.26666667 -0.04761905 -0.03809524]
 [ 0.13333333 -0.0952381   0.12380952]] 



In [27]:
# identity matrix (result of matrix x matrix_inverse)
iden_m =  np.dot(m, m_inv)
iden_m = np.round(np.abs(iden_m), 0)
print ('Product of matrix and its inverse:\n', iden_m)

Product of matrix and its inverse:
 [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [28]:
# eigendecomposition
m = np.array([[1, 5, 2],
              [4, 7, 4],
              [2, 0, 9]])

eigen_vals, eigen_vecs = np.linalg.eig(m)

print('Eigen Values:', eigen_vals, '\n')
print('Eigen Vectors:\n', eigen_vecs)

Eigen Values: [-1.32455532 11.32455532  7.        ] 

Eigen Vectors:
 [[-0.91761521  0.46120352 -0.46829291]
 [ 0.35550789  0.79362022 -0.74926865]
 [ 0.17775394  0.39681011  0.46829291]]


In [29]:
# SVD
m = np.array([[1, 5, 2],
              [4, 7, 4],
              [2, 0, 9]])

U, S, VT = np.linalg.svd(m)

print ('Getting SVD outputs:-\n')
print('U:\n', U, '\n')
print('S:\n', S, '\n')
print('VT:\n', VT, '\n')

Getting SVD outputs:-

U:
 [[ 0.3831556  -0.39279153  0.83600634]
 [ 0.68811254 -0.48239977 -0.54202545]
 [ 0.61619228  0.78294653  0.0854506 ]] 

S:
 [12.10668383  6.91783499  1.25370079] 

VT:
 [[ 0.36079164  0.55610321  0.74871798]
 [-0.10935467 -0.7720271   0.62611158]
 [-0.92621323  0.30777163  0.21772844]] 



In [30]:
# descriptive statistics
import scipy as sp
import numpy as np

# get data
nums = np.random.randint(1,20, size=(1,15))[0]
print('Data: ', nums)

Data:  [12  3 15 19  7  5 12  4  4 14  6 15  1  4  6]


In [31]:
# get descriptive stats
print ('Mean:', sp.mean(nums))
print ('Median:', sp.median(nums))
print ('Mode:', sp.stats.mode(nums))
print ('Standard Deviation:', sp.std(nums))
print ('Variance:', sp.var(nums))
print ('Skew:', sp.stats.skew(nums))
print ('Kurtosis:', sp.stats.kurtosis(nums))

Mean: 8.466666666666667
Median: 6.0
Mode: ModeResult(mode=array([4]), count=array([3]))
Standard Deviation: 5.3149683807986
Variance: 28.248888888888892
Skew: 0.4716642617807957
Kurtosis: -1.110129188200344


In [34]:
from sklearn import datasets
diabetes = datasets.load_diabetes()
X = diabetes.data[:10]
y = diabetes.target

In [35]:
X[:5]

array([[ 0.03807591,  0.05068012,  0.06169621,  0.02187235, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, -0.02632783, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, -0.00567061, -0.04559945,
        -0.03419447, -0.03235593, -0.00259226,  0.00286377, -0.02593034],
       [-0.08906294, -0.04464164, -0.01159501, -0.03665645,  0.01219057,
         0.02499059, -0.03603757,  0.03430886,  0.02269202, -0.00936191],
       [ 0.00538306, -0.04464164, -0.03638469,  0.02187235,  0.00393485,
         0.01559614,  0.00814208, -0.00259226, -0.03199144, -0.04664087]])

In [36]:
y[:10]

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310.])

In [37]:
feature_names=['age', 'sex', 'bmi', 'bp',
               's1', 's2', 's3', 's4', 's5', 's6']

In [38]:
from sklearn import datasets
from sklearn.linear_model import Lasso

from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

diabetes = datasets.load_diabetes()
X_train = diabetes.data[:310]
y_train = diabetes.target[:310]

X_test = diabetes.data[310:]
y_test = diabetes.target[310:]

lasso = Lasso(random_state=0)
alphas = np.logspace(-4, -0.5, 30)

scores = list()
scores_std = list()

estimator = GridSearchCV(lasso,
                         param_grid = dict(alpha=alphas))

estimator.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=0,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([1.00000e-04, 1.32035e-04, 1.74333e-04, 2.30181e-04, 3.03920e-04,
       4.01281e-04, 5.29832e-04, 6.99564e-04, 9.23671e-04, 1.21957e-03,
       1.61026e-03, 2.12611e-03, 2.80722e-03, 3.70651e-03, 4.89390e-03,
       6.46167e-03, 8.53168e-03, 1.12648e-02, 1.48735e-02, 1.96383e-02,
       2.59294e-02, 3.42360e-02, 4.52035e-02, 5.96846e-02, 7.88046e-02,
       1.04050e-01, 1.37382e-01, 1.81393e-01, 2.39503e-01, 3.16228e-01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
diabetes.keys()

dict_keys(['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])

In [40]:
diabetes['data']

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]])

In [41]:
estimator.best_score_

0.4654063759023531

In [42]:
estimator.best_estimator_

Lasso(alpha=0.02592943797404667, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=0, selection='cyclic', tol=0.0001, warm_start=False)

In [43]:
estimator.predict(X_test)

array([203.42104984, 177.6595529 , 122.62188598, 212.81136958,
       173.61633075, 114.76145025, 202.36033584, 171.70767813,
       164.28694562, 191.29091477, 191.41279009, 288.2772433 ,
       296.47009002, 234.53378413, 210.61427168, 228.62812055,
       156.74489991, 225.08834492, 191.75874632, 102.81600989,
       172.373221  , 111.20843429, 290.22242876, 178.64605207,
        78.13722832,  86.35832297, 256.41378529, 165.99622543,
       121.29260976, 153.48718848, 163.09835143, 180.0932902 ,
       161.4330553 , 155.80211635, 143.70181085, 126.13753819,
       181.06471818, 105.03679977, 131.0479936 ,  90.50606427,
       252.66486639,  84.84786067,  59.41005358, 184.51368208,
       201.46598714, 129.96333913,  90.65641478, 200.10932516,
        55.2884802 , 171.60459062, 195.40750666, 122.14139787,
       231.72783897, 159.49750022, 160.32104862, 165.53701866,
       260.73217736, 259.77213787, 204.69526082, 185.66480969,
        61.09821961, 209.9214333 , 108.50410841, 141.18

In [44]:
y_test

array([109., 180., 144., 163., 147.,  97., 220., 190., 109., 191., 122.,
       230., 242., 248., 249., 192., 131., 237.,  78., 135., 244., 199.,
       270., 164.,  72.,  96., 306.,  91., 214.,  95., 216., 263., 178.,
       113., 200., 139., 139.,  88., 148.,  88., 243.,  71.,  77., 109.,
       272.,  60.,  54., 221.,  90., 311., 281., 182., 321.,  58., 262.,
       206., 233., 242., 123., 167.,  63., 197.,  71., 168., 140., 217.,
       121., 235., 245.,  40.,  52., 104., 132.,  88.,  69., 219.,  72.,
       201., 110.,  51., 277.,  63., 118.,  69., 273., 258.,  43., 198.,
       242., 232., 175.,  93., 168., 275., 293., 281.,  72., 140., 189.,
       181., 209., 136., 261., 113., 131., 174., 257.,  55.,  84.,  42.,
       146., 212., 233.,  91., 111., 152., 120.,  67., 310.,  94., 183.,
        66., 173.,  72.,  49.,  64.,  48., 178., 104., 132., 220.,  57.])

In [45]:
df = pd.DataFrame({'Pred':estimator.predict(X_test), 'Actual':y_test})
df

Unnamed: 0,Pred,Actual
0,203.421050,109.0
1,177.659553,180.0
2,122.621886,144.0
3,212.811370,163.0
4,173.616331,147.0
5,114.761450,97.0
6,202.360336,220.0
7,171.707678,190.0
8,164.286946,109.0
9,191.290915,191.0


In [47]:
import numpy
import theano.tensor as T
from theano import function
x = T.dscalar('x')
y = T.dscalar('y')
z = x + y

In [48]:
f = function([x, y], z)
f(8, 2)

array(10.)

In [1]:
import pandas as pd
d =  [{'city':'Delhi',"data":1000},
      {'city':'Banglaore',"data":2000},
      {'city':'Mumbai',"data":1000}]
pd.DataFrame(d)

Unnamed: 0,city,data
0,Delhi,1000
1,Banglaore,2000
2,Mumbai,1000


In [2]:
df = pd.DataFrame(d)

In [3]:
city_data = pd.read_csv(filepath_or_buffer="https://raw.githubusercontent.com/dipanjanS/practical-machine-learning-with-python/master/notebooks/Ch02_The_Python_ML_Ecosystem/simplemaps-worldcities-basic.csv")

In [4]:
city_data.head(n=10)

Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
0,Qal eh-ye Now,Qal eh-ye,34.983,63.1333,2997.0,Afghanistan,AF,AFG,Badghis
1,Chaghcharan,Chaghcharan,34.516701,65.250001,15000.0,Afghanistan,AF,AFG,Ghor
2,Lashkar Gah,Lashkar Gah,31.582998,64.36,201546.0,Afghanistan,AF,AFG,Hilmand
3,Zaranj,Zaranj,31.112001,61.886998,49851.0,Afghanistan,AF,AFG,Nimroz
4,Tarin Kowt,Tarin Kowt,32.633298,65.866699,10000.0,Afghanistan,AF,AFG,Uruzgan
5,Zareh Sharan,Zareh Sharan,32.85,68.416705,13737.0,Afghanistan,AF,AFG,Paktika
6,Asadabad,Asadabad,34.866,71.150005,48400.0,Afghanistan,AF,AFG,Kunar
7,Taloqan,Taloqan,36.729999,69.540004,64256.0,Afghanistan,AF,AFG,Takhar
8,Mahmud-E Eraqi,Mahmud-E Eraqi,35.016696,69.333301,7407.0,Afghanistan,AF,AFG,Kapisa
9,Mehtar Lam,Mehtar Lam,34.65,70.166701,17345.0,Afghanistan,AF,AFG,Laghman


In [5]:
city_data.tail()

Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
7317,Mutare,Mutare,-18.970019,32.650038,216785.0,Zimbabwe,ZW,ZWE,Manicaland
7318,Kadoma,Kadoma,-18.330006,29.909947,56400.0,Zimbabwe,ZW,ZWE,Mashonaland West
7319,Chitungwiza,Chitungwiza,-18.000001,31.100003,331071.0,Zimbabwe,ZW,ZWE,Harare
7320,Harare,Harare,-17.81779,31.044709,1557406.5,Zimbabwe,ZW,ZWE,Harare
7321,Bulawayo,Bulawayo,-20.169998,28.580002,697096.0,Zimbabwe,ZW,ZWE,Bulawayo


In [6]:
series_es = city_data.lat

In [7]:
type(series_es)

pandas.core.series.Series

In [8]:
series_es[1:10:2]

1    34.516701
3    31.112001
5    32.850000
7    36.729999
9    34.650000
Name: lat, dtype: float64

In [9]:
series_es[:7]

0    34.983000
1    34.516701
2    31.582998
3    31.112001
4    32.633298
5    32.850000
6    34.866000
Name: lat, dtype: float64

In [10]:
series_es[:-7315]

0    34.983000
1    34.516701
2    31.582998
3    31.112001
4    32.633298
5    32.850000
6    34.866000
Name: lat, dtype: float64

In [11]:
city_data[:7]

Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
0,Qal eh-ye Now,Qal eh-ye,34.983,63.1333,2997.0,Afghanistan,AF,AFG,Badghis
1,Chaghcharan,Chaghcharan,34.516701,65.250001,15000.0,Afghanistan,AF,AFG,Ghor
2,Lashkar Gah,Lashkar Gah,31.582998,64.36,201546.0,Afghanistan,AF,AFG,Hilmand
3,Zaranj,Zaranj,31.112001,61.886998,49851.0,Afghanistan,AF,AFG,Nimroz
4,Tarin Kowt,Tarin Kowt,32.633298,65.866699,10000.0,Afghanistan,AF,AFG,Uruzgan
5,Zareh Sharan,Zareh Sharan,32.85,68.416705,13737.0,Afghanistan,AF,AFG,Paktika
6,Asadabad,Asadabad,34.866,71.150005,48400.0,Afghanistan,AF,AFG,Kunar


In [12]:
city_data.iloc[:5,:4]

Unnamed: 0,city,city_ascii,lat,lng
0,Qal eh-ye Now,Qal eh-ye,34.983,63.1333
1,Chaghcharan,Chaghcharan,34.516701,65.250001
2,Lashkar Gah,Lashkar Gah,31.582998,64.36
3,Zaranj,Zaranj,31.112001,61.886998
4,Tarin Kowt,Tarin Kowt,32.633298,65.866699


In [13]:
city_data[city_data['pop'] > 10000000][city_data.columns[pd.Series(city_data.columns).str.startswith('l')]]

Unnamed: 0,lat,lng
360,-34.602502,-58.397531
1171,-23.55868,-46.62502
2068,31.216452,121.436505
3098,28.669993,77.230004
3110,19.01699,72.856989
3492,35.685017,139.751407
4074,19.442442,-99.130988
4513,24.869992,66.990009
5394,55.752164,37.615523
6124,41.104996,29.010002


In [14]:
city_greater_10mil = city_data[city_data['pop'] > 10000000]
city_greater_10mil.rename(columns={'pop':'population'}, inplace=True)
city_greater_10mil.where(city_greater_10mil.population > 15000000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,city,city_ascii,lat,lng,population,country,iso2,iso3,province
360,,,,,,,,,
1171,,,,,,,,,
2068,,,,,,,,,
3098,,,,,,,,,
3110,Mumbai,Mumbai,19.01699,72.856989,15834918.0,India,IN,IND,Maharashtra
3492,Tokyo,Tokyo,35.685017,139.751407,22006299.5,Japan,JP,JPN,Tokyo
4074,,,,,,,,,
4513,,,,,,,,,
5394,,,,,,,,,
6124,,,,,,,,,


In [17]:
df = pd.DataFrame(np.random.randn(8, 3),
columns=['A', 'B', 'C'])

In [18]:
nparray = df.values
type(nparray)

numpy.ndarray

In [19]:
from numpy import nan
df.iloc[4,2] = nan

In [20]:
df

Unnamed: 0,A,B,C
0,-0.835956,-0.805907,0.5144
1,-0.589583,-0.163839,-0.900284
2,-1.125043,-0.047118,-0.488335
3,-0.432538,-1.998501,-0.522701
4,0.779593,0.32543,
5,-2.510527,-0.902316,-1.866097
6,0.082142,-0.303555,-0.064109
7,-0.751863,2.271587,0.418232


In [21]:
df.fillna(0)

Unnamed: 0,A,B,C
0,-0.835956,-0.805907,0.5144
1,-0.589583,-0.163839,-0.900284
2,-1.125043,-0.047118,-0.488335
3,-0.432538,-1.998501,-0.522701
4,0.779593,0.32543,0.0
5,-2.510527,-0.902316,-1.866097
6,0.082142,-0.303555,-0.064109
7,-0.751863,2.271587,0.418232


In [22]:
columns_numeric = ['lat','lng','pop']

In [23]:
city_data[columns_numeric].mean()

lat        20.662876
lng        10.711914
pop    265463.071633
dtype: float64

In [24]:
city_data[columns_numeric].sum()

lat    1.512936e+05
lng    7.843263e+04
pop    1.943721e+09
dtype: float64

In [25]:
city_data[columns_numeric].count()

lat    7322
lng    7322
pop    7322
dtype: int64

In [26]:
city_data[columns_numeric].median()

lat       26.792730
lng       18.617509
pop    61322.750000
dtype: float64

In [27]:
city_data[columns_numeric].quantile(0.8)

lat        46.852480
lng        89.900018
pop    269210.000000
Name: 0.8, dtype: float64

In [28]:
city_data[columns_numeric].sum(axis = 1).head()

0      3095.116300
1     15099.766702
2    201641.942998
3     49943.998999
4     10098.499997
dtype: float64

In [29]:
city_data[columns_numeric].describe()

Unnamed: 0,lat,lng,pop
count,7322.0,7322.0,7322.0
mean,20.662876,10.711914,265463.1
std,29.134818,79.044615,828762.2
min,-89.982894,-179.589979,-99.0
25%,-0.32471,-64.788472,17344.25
50%,26.79273,18.617509,61322.75
75%,43.575448,73.103628,200172.6
max,82.483323,179.383304,22006300.0


In [31]:
city_data1 = city_data.sample(3)
city_data1

Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
3152,Blitar,Blitar,-8.069599,112.149991,132416.0,Indonesia,ID,IDN,Jawa Timur
5092,Barabinsk,Barabinsk,55.357279,78.351899,29888.5,Russia,RU,RUS,Novosibirsk
5816,Dayr az Zawr,Dayr az Zawr,35.330387,40.129995,275853.0,Syria,SY,SYR,Dayr Az Zawr


In [32]:
city_data2 = city_data.sample(3)
city_data_combine = pd.concat([city_data1,city_data2])
city_data_combine

Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
3152,Blitar,Blitar,-8.069599,112.149991,132416.0,Indonesia,ID,IDN,Jawa Timur
5092,Barabinsk,Barabinsk,55.357279,78.351899,29888.5,Russia,RU,RUS,Novosibirsk
5816,Dayr az Zawr,Dayr az Zawr,35.330387,40.129995,275853.0,Syria,SY,SYR,Dayr Az Zawr
7264,Al Mukalla,Al Mukalla,14.541165,49.125931,194080.5,Yemen,YE,YEM,Hadramawt
3318,Sligo,Sligo,54.267061,-8.483317,17214.0,Ireland,IE,IRL,Sligo
1001,Itamaraju,Itamaraju,-17.039594,-39.529949,35055.0,Brazil,BR,BRA,Bahia


In [33]:
df1 = pd.DataFrame({'col1': ['col10', 'col11', 'col12', 'col13'],
                    'col2': ['col20', 'col21', 'col22', 'col23'],
                    'col3': ['col30', 'col31', 'col32', 'col33'],
                    'col4': ['col40', 'col41', 'col42', 'col43']},
                   index=[0, 1, 2, 3])
df1

Unnamed: 0,col1,col2,col3,col4
0,col10,col20,col30,col40
1,col11,col21,col31,col41
2,col12,col22,col32,col42
3,col13,col23,col33,col43


In [34]:
df4 = pd.DataFrame({'col2': ['col22', 'col23', 'col26', 'col27'],
                    'Col4': ['Col42', 'Col43', 'Col46', 'Col47'],
                    'col6': ['col62', 'col63', 'col66', 'col67']},
                   index=[2, 3, 6, 7])

pd.concat([df1,df4], axis=1)

Unnamed: 0,col1,col2,col3,col4,col2.1,Col4,col6
0,col10,col20,col30,col40,,,
1,col11,col21,col31,col41,,,
2,col12,col22,col32,col42,col22,Col42,col62
3,col13,col23,col33,col43,col23,Col43,col63
6,,,,,col26,Col46,col66
7,,,,,col27,Col47,col67


In [35]:
country_data = city_data[['iso3','country']].drop_duplicates()

In [36]:
country_data.shape

(223, 2)

In [37]:
country_data.head()

Unnamed: 0,iso3,country
0,AFG,Afghanistan
33,ALD,Aland
34,ALB,Albania
60,DZA,Algeria
111,ASM,American Samoa


In [38]:
del(city_data['country'])

In [39]:
city_data.merge(country_data, 'inner').head()

Unnamed: 0,city,city_ascii,lat,lng,pop,iso2,iso3,province,country
0,Qal eh-ye Now,Qal eh-ye,34.983,63.1333,2997.0,AF,AFG,Badghis,Afghanistan
1,Chaghcharan,Chaghcharan,34.516701,65.250001,15000.0,AF,AFG,Ghor,Afghanistan
2,Lashkar Gah,Lashkar Gah,31.582998,64.36,201546.0,AF,AFG,Hilmand,Afghanistan
3,Zaranj,Zaranj,31.112001,61.886998,49851.0,AF,AFG,Nimroz,Afghanistan
4,Tarin Kowt,Tarin Kowt,32.633298,65.866699,10000.0,AF,AFG,Uruzgan,Afghanistan


In [40]:
from sklearn import datasets
diabetes = datasets.load_diabetes()
X = diabetes.data[:10]
y = diabetes.target

In [41]:
X[:5]

array([[ 0.03807591,  0.05068012,  0.06169621,  0.02187235, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, -0.02632783, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, -0.00567061, -0.04559945,
        -0.03419447, -0.03235593, -0.00259226,  0.00286377, -0.02593034],
       [-0.08906294, -0.04464164, -0.01159501, -0.03665645,  0.01219057,
         0.02499059, -0.03603757,  0.03430886,  0.02269202, -0.00936191],
       [ 0.00538306, -0.04464164, -0.03638469,  0.02187235,  0.00393485,
         0.01559614,  0.00814208, -0.00259226, -0.03199144, -0.04664087]])

In [42]:
y[:10]

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310.])

In [48]:
diabetes.keys()

dict_keys(['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])

In [43]:
feature_names=['age', 'sex', 'bmi', 'bp',
               's1', 's2', 's3', 's4', 's5', 's6']

In [44]:
from sklearn import datasets
from sklearn.linear_model import Lasso

from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

diabetes = datasets.load_diabetes()
X_train = diabetes.data[:310]
y_train = diabetes.target[:310]

X_test = diabetes.data[310:]
y_test = diabetes.target[310:]

lasso = Lasso(random_state=0)
alphas = np.logspace(-4, -0.5, 30)

scores = list()
scores_std = list()

estimator = GridSearchCV(lasso,
                         param_grid = dict(alpha=alphas))

estimator.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=0,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([1.00000e-04, 1.32035e-04, 1.74333e-04, 2.30181e-04, 3.03920e-04,
       4.01281e-04, 5.29832e-04, 6.99564e-04, 9.23671e-04, 1.21957e-03,
       1.61026e-03, 2.12611e-03, 2.80722e-03, 3.70651e-03, 4.89390e-03,
       6.46167e-03, 8.53168e-03, 1.12648e-02, 1.48735e-02, 1.96383e-02,
       2.59294e-02, 3.42360e-02, 4.52035e-02, 5.96846e-02, 7.88046e-02,
       1.04050e-01, 1.37382e-01, 1.81393e-01, 2.39503e-01, 3.16228e-01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [45]:
estimator.best_score_

0.4654063759023531

In [46]:
estimator.best_estimator_

Lasso(alpha=0.02592943797404667, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=0, selection='cyclic', tol=0.0001, warm_start=False)

In [47]:
estimator.predict(X_test)

array([203.42104984, 177.6595529 , 122.62188598, 212.81136958,
       173.61633075, 114.76145025, 202.36033584, 171.70767813,
       164.28694562, 191.29091477, 191.41279009, 288.2772433 ,
       296.47009002, 234.53378413, 210.61427168, 228.62812055,
       156.74489991, 225.08834492, 191.75874632, 102.81600989,
       172.373221  , 111.20843429, 290.22242876, 178.64605207,
        78.13722832,  86.35832297, 256.41378529, 165.99622543,
       121.29260976, 153.48718848, 163.09835143, 180.0932902 ,
       161.4330553 , 155.80211635, 143.70181085, 126.13753819,
       181.06471818, 105.03679977, 131.0479936 ,  90.50606427,
       252.66486639,  84.84786067,  59.41005358, 184.51368208,
       201.46598714, 129.96333913,  90.65641478, 200.10932516,
        55.2884802 , 171.60459062, 195.40750666, 122.14139787,
       231.72783897, 159.49750022, 160.32104862, 165.53701866,
       260.73217736, 259.77213787, 204.69526082, 185.66480969,
        61.09821961, 209.9214333 , 108.50410841, 141.18

In [49]:
import numpy
import theano.tensor as T
from theano import function
x = T.dscalar('x')
y = T.dscalar('y')
z = x + y

In [50]:
f = function([x, y], z)
f(8, 2)

array(10.)

In [12]:
import tensorflow as tf
hello = tf.constant('Hello, TensorFlow!')
sess = tf.Session()
print(sess.run(hello))

b'Hello, TensorFlow!'


In [13]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

X_train = cancer.data[:340]
y_train = cancer.target[:340]

X_test = cancer.data[340:]
y_test = cancer.target[340:]

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [14]:
model = Sequential()
model.add(Dense(15, input_dim=30, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [15]:
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [16]:
model.fit(X_train, y_train,
          epochs=20,
          batch_size=50)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1ead8231748>

In [17]:
predictions = model.predict_classes(X_test)

In [18]:
from sklearn import metrics

print('Accuracy:', metrics.accuracy_score(y_true=y_test, y_pred=predictions))
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

Accuracy: 0.8471615720524017
              precision    recall  f1-score   support

           0       0.63      0.89      0.74        55
           1       0.96      0.83      0.89       174

   micro avg       0.85      0.85      0.85       229
   macro avg       0.79      0.86      0.81       229
weighted avg       0.88      0.85      0.85       229



In [19]:
model = Sequential()
model.add(Dense(15, input_dim=30, activation='relu'))
model.add(Dense(15, activation='relu'))
model.add(Dense(15, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=20,
          batch_size=50)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1ead897c9e8>

In [20]:
predictions = model.predict_classes(X_test)

In [22]:
print('Accuracy:', metrics.accuracy_score(y_true=y_test, y_pred=predictions))
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

Accuracy: 0.759825327510917
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        55
           1       0.76      1.00      0.86       174

   micro avg       0.76      0.76      0.76       229
   macro avg       0.38      0.50      0.43       229
weighted avg       0.58      0.76      0.66       229

