In [34]:
import pandas as pd
# Build a small two-column frame; both columns share the same row labels.
shared_index = ['a', 'b', 'c', 'd']
column_data = {
    "one": pd.Series([1, 2, 3, 4], index=shared_index),
    "two": pd.Series([5, 6, 7, 9], index=shared_index),
}
d = pd.DataFrame(column_data)
print(d)

   one  two
a    1    5
b    2    6
c    3    7
d    4    9


In [6]:
# Assigning a scalar fills the new column with that value on every row.
d["three"] = 3
print(d)

# Assigning a Series aligns on the row labels: 'a' has no matching label
# in the Series, so it is filled with NaN.
partial_column = pd.Series([1, 2, 3], index=['b', 'c', 'd'])
d["four"] = partial_column
print(d)

   one  two  four  three
a    1    5   NaN      3
b    2    6   1.0      3
c    3    7   2.0      3
d    4    9   3.0      3
   one  two  four  three
a    1    5   NaN      3
b    2    6   1.0      3
c    3    7   2.0      3
d    4    9   3.0      3


In [7]:
# Remove the "three" column from the frame in place, then show what is left.
d.drop("three", axis=1, inplace=True)
print(d)

   one  two  four
a    1    5   NaN
b    2    6   1.0
c    3    7   2.0
d    4    9   3.0


In [8]:
# Load the purchase dataset from a CSV file in the working directory.
# For a file without a header row: read_csv(filename, header=None)
csv_path = "purchase_data_class_4.csv"
df = pd.read_csv(csv_path)

print(df)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


### Set a column as the index (ideally one with unique values; `Country` repeats here, so the index labels are not unique)

In [9]:
# Use the Country values as row labels; drop=False keeps Country available
# as a regular column as well.
df2 = df.set_index(keys="Country", drop=False)
print(df2)

         Country   Age   Salary Purchased
Country                                  
France    France  44.0  72000.0        No
Spain      Spain  27.0  48000.0       Yes
Germany  Germany  30.0  54000.0        No
Spain      Spain  38.0  61000.0        No
Germany  Germany  40.0      NaN       Yes
France    France  35.0  58000.0       Yes
Spain      Spain   NaN  52000.0        No
France    France  48.0  79000.0       Yes
Germany  Germany  50.0  83000.0        No
France    France  37.0  67000.0       Yes


### Extracting a subset of dataframe
#### df.loc[startrow:endrow, startcolumn:endcolumn] (label-based; both endpoints are included)

In [10]:
# Label-based slicing with .loc includes both endpoints:
# rows 0 through 5 and columns Country through Salary.
row_range = slice(0, 5)
column_range = slice("Country", "Salary")
print(df.loc[row_range, column_range])

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0


### Extracting a column

In [11]:
# Every row of the Age column, selected by label through .loc.
age_via_loc = df.loc[:, "Age"]
print(age_via_loc)

0    44.0
1    27.0
2    30.0
3    38.0
4    40.0
5    35.0
6     NaN
7    48.0
8    50.0
9    37.0
Name: Age, dtype: float64


In [12]:
# The same column through plain bracket indexing on the frame.
age_via_brackets = df["Age"]
print(age_via_brackets)

0    44.0
1    27.0
2    30.0
3    38.0
4    40.0
5    35.0
6     NaN
7    48.0
8    50.0
9    37.0
Name: Age, dtype: float64


### Extracting a Row
#### df.loc["index", :]

In [13]:
# The row whose index label is 1, across all columns; the result is a Series
# keyed by column name.
row_at_label_1 = df.loc[1, :]
print(row_at_label_1)

Country      Spain
Age             27
Salary       48000
Purchased      Yes
Name: 1, dtype: object


### Extracting specific columns

In [14]:
# Passing a list of column names returns a frame with just those columns.
wanted_columns = ["Country", "Salary"]
print(df[wanted_columns])

   Country   Salary
0   France  72000.0
1    Spain  48000.0
2  Germany  54000.0
3    Spain  61000.0
4  Germany      NaN
5   France  58000.0
6    Spain  52000.0
7   France  79000.0
8  Germany  83000.0
9   France  67000.0


### Extracting Specific Rows

In [15]:
# Slicing the frame directly selects rows; the end of the slice is excluded,
# so this shows rows 1, 2 and 3.
row_window = slice(1, 4)
print(df[row_window])

   Country   Age   Salary Purchased
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No


In [16]:
# A list of index labels picks out exactly those rows, in the order given.
picked_rows = [1, 3, 4]
print(df.loc[picked_rows, :])

   Country   Age   Salary Purchased
1    Spain  27.0  48000.0       Yes
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes


### Applying methods to subsets

In [17]:
# Methods can be applied directly to a selected column.
salary_column = df.loc[:, "Salary"]
mean = salary_column.mean()  # the missing salary is left out of the average
print(mean)

63777.77777777778


### Separating out features and labels from dataset

In [69]:
# Split the frame into a feature matrix and a label column.
# Both slices are written relative to the last column so they stay
# complementary if feature columns are added or removed.

# Features: every column except the last one.
X = df.iloc[:, :-1].values
# Labels: the last column only; the slice (rather than a scalar index) keeps
# the result two-dimensional, which later cells rely on via Y[:, 0].
Y = df.iloc[:, -1:].values

print("Features \n", X)
print("Labels \n", Y)

Features 
 [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
Labels 
 [['No']
 ['Yes']
 ['No']
 ['No']
 ['Yes']
 ['Yes']
 ['No']
 ['Yes']
 ['No']
 ['Yes']]


### Fill out the missing values

In [64]:
# Count the missing entries in each column.
missing_mask = df.isnull()
missing_mask.sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [72]:
# Imputer that replaces missing values with the mean of their column.
#
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22. Its replacement, `SimpleImputer` (scikit-learn >= 0.20),
# always works column by column (the old `axis = 0`) and takes the missing
# value marker as np.nan instead of the string 'NaN'.
import numpy as np
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

In [73]:
# Learn the replacement values from the Age and Salary columns (columns 1
# and 2 of X), then write the imputed values back into X.
numeric_columns = slice(1, 3)
imputer = imputer.fit(X[:, numeric_columns])
X[:, numeric_columns] = imputer.transform(X[:, numeric_columns])
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 40.0]
 ['France' 35.0 58000.0]
 ['Spain' 52000.0 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [50]:
# fit_transform() does the fit and the transform in a single call.
imputed_values = imputer.fit_transform(X[:, 1:3])
X[:, 1:3] = imputed_values
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


### Encode Strings as Numbers

In [51]:
# LabelEncoder turns each distinct value into an integer code;
# OneHotEncoder is imported here for the one-hot encoding step further down.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Encoder instance used by the label-encoding cell that follows.
labelencoder = LabelEncoder()

In [52]:
# Encode the string categories as integer codes.
#
# Each column gets its own encoder: refitting one shared encoder on the
# labels would overwrite the mapping learned for the countries, so the
# country codes could no longer be translated back with inverse_transform().

# For countries
country_encoder = LabelEncoder()
X[:, 0] = country_encoder.fit_transform(X[:, 0])

# For the purchase labels (`labelencoder` ends up fitted on Y, as before)
labelencoder = LabelEncoder()
Y[:, 0] = labelencoder.fit_transform(Y[:, 0])

print("Countries \n", X[:,0])
print("Labels \n", Y[:, 0])

Countries 
 [0 2 1 2 1 0 2 0 1 0]
Labels 
 [0 1 0 0 1 1 0 1 0 1]


### One Hot Encoding

In [53]:
# One-hot encode the country codes in column 0 and keep the remaining
# (numeric) columns unchanged after the dummy columns.
#
# `OneHotEncoder(categorical_features=...)` was deprecated in scikit-learn
# 0.20 and removed in 0.22; `ColumnTransformer` (scikit-learn >= 0.20) is the
# supported way to apply an encoder to selected columns only.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

onehotencoder = ColumnTransformer(
    transformers=[("country", OneHotEncoder(), [0])],  # Index of the categorical column
    remainder="passthrough",  # pass the other columns through untouched
    # Stack the result as a sparse matrix whenever it contains any zeros, so
    # the `.toarray()` call in a later cell keeps working.
    sparse_threshold=1.0,
)

In [57]:
# Fit the encoder on X and replace X with the encoded matrix. The result is
# sparse, which is why it prints as (row, column) / value pairs.
encoded_features = onehotencoder.fit_transform(X)
X = encoded_features
print(X)

  (0, 0)	1.0
  (1, 2)	1.0
  (2, 1)	1.0
  (3, 2)	1.0
  (4, 1)	1.0
  (5, 0)	1.0
  (6, 2)	1.0
  (7, 0)	1.0
  (8, 1)	1.0
  (9, 0)	1.0
  (0, 3)	44.0
  (0, 4)	72000.0
  (1, 3)	27.0
  (1, 4)	48000.0
  (2, 3)	30.0
  (2, 4)	54000.0
  (3, 3)	38.0
  (3, 4)	61000.0
  (4, 3)	40.0
  (4, 4)	63777.7777778
  (5, 3)	35.0
  (5, 4)	58000.0
  (6, 3)	38.7777777778
  (6, 4)	52000.0
  (7, 3)	48.0
  (7, 4)	79000.0
  (8, 3)	50.0
  (8, 4)	83000.0
  (9, 3)	37.0
  (9, 4)	67000.0


In [58]:
# Expand the sparse matrix into a regular dense array.
dense_features = X.toarray()
X = dense_features
print(X)

[[  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.40000000e+01
    7.20000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   2.70000000e+01
    4.80000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   3.00000000e+01
    5.40000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.80000000e+01
    6.10000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   4.00000000e+01
    6.37777778e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.50000000e+01
    5.80000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.87777778e+01
    5.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.80000000e+01
    7.90000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   5.00000000e+01
    8.30000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.70000000e+01
    6.70000000e+04]]


### Splitting the data into train and test set

In [61]:
# Split features and labels into a training part and a test part.
# A fixed random_state makes the shuffle, and therefore the split, repeatable.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, Y, test_size = 0.33, random_state = 0)
X_train, X_test, y_train, y_test = split_parts

for caption, part in (
    ("X_train \n", X_train),
    ("Y_train \n", y_train),
    ("X_test \n", X_test),
    ("Y_test \n", y_test),
):
    print(caption, part)

X_train 
 [[  0.00000000e+00   0.00000000e+00   1.00000000e+00   2.70000000e+01
    4.80000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.87777778e+01
    5.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.80000000e+01
    7.90000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.80000000e+01
    6.10000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.40000000e+01
    7.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.50000000e+01
    5.80000000e+04]]
Y_train 
 [[1]
 [0]
 [1]
 [0]
 [0]
 [1]]
X_test 
 [[  0.00000000e+00   1.00000000e+00   0.00000000e+00   3.00000000e+01
    5.40000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   5.00000000e+01
    8.30000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   4.00000000e+01
    6.37777778e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.70000000e+01
    6.70000000e+04]]
Y_test 
 [[0]
 [0]
 [1]
 [1]]


### Scaling the data

In [62]:
# Feature scaling: standardise every feature column.
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
# The scaler is fitted on the training rows only ...
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
# ... and the test rows are transformed with that same fitted scaler.
X_test = sc_X.transform(X_test)  # Trivia : Why only transform?

for caption, part in (("X_train \n", X_train), ("X_test \n", X_test)):
    print(caption, part)

X_train 
 [[-1.          0.          1.         -1.72380603 -1.26408415]
 [-1.          0.          1.          0.04734201 -0.8941083 ]
 [ 1.          0.         -1.          1.43418434  1.60322868]
 [-1.          0.          1.         -0.0696206  -0.06166264]
 [ 1.          0.         -1.          0.83266236  0.95577094]
 [ 1.          0.         -1.         -0.52076208 -0.33914453]]
X_test 
 [[-1.          1.         -1.         -1.27266455 -0.70912038]
 [-1.          1.         -1.          1.73494533  1.97320453]
 [-1.          1.         -1.          0.23114039  0.19526503]
 [ 1.          0.         -1.         -0.22000109  0.49330113]]
