In [19]:
import pandas as pd
import numpy as np

How to open data files in pandas
You might have your data in .csv files or SQL tables. Maybe Excel files. Or .tsv files. Or something else. But the goal is the same in all cases. If you want to analyze that data using pandas, the first step will be to read it into a data structure that’s compatible with pandas.

Pandas data structures
There are two types of data structures in pandas: Series and DataFrames.

Series: a pandas Series is a one-dimensional data structure (“a one-dimensional ndarray”) that stores values, and every value is paired with an index label (labels are usually unique, but they do not have to be).
Remember that, unlike a Python list, a Series has a single dtype that applies to all of its values.

In [3]:

  
# Build a Series from a plain Python list. No index is supplied, so pandas
# labels the values with the default positions 0 .. len(list) - 1.
x = pd.Series(data=['Geeks', 'for', 'Geeks'])

# Show the values together with their index labels.
print(x)


0    Geeks
1      for
2    Geeks
dtype: object


DataFrame: a pandas DataFrame is a two-dimensional data structure – basically a table with rows and columns. The columns have names and the rows have index labels.

Different ways to create a pandas DataFrame
Method #1: Creating Pandas DataFrame from lists of lists.

In [4]:
# Row-oriented input: each inner list is one row, [name, age].
data = [
    ['tom', 10],
    ['nick', 15],
    ['juli', 14],
]

# Build the frame and label the two columns explicitly.
df = pd.DataFrame(data=data, columns=['Name', 'Age'])

# A bare expression on the last line makes the notebook render the frame.
df

Unnamed: 0,Name,Age
0,tom,10
1,nick,15
2,juli,14


Method #2: Creating a DataFrame from a dict of ndarrays/lists
To create a DataFrame from a dict of ndarrays/lists, all the arrays must have the same length. If an index is passed, its length must equal the length of the arrays. If no index is passed, the index defaults to range(n), where n is the array length.

In [5]:
# Column-oriented input: each key becomes a column name and each list
# supplies that column's values (all lists have the same length).
data = {
    'Name': ['Tom', 'nick', 'krish', 'jack'],
    'Age': [20, 21, 19, 18],
}

# No index is passed, so the rows get the default 0 .. n-1 labels.
df = pd.DataFrame(data=data)

# Render the resulting frame.
df

Unnamed: 0,Name,Age
0,Tom,20
1,nick,21
2,krish,19
3,jack,18


How to load the iris dataset into a DataFrame

In [6]:
from sklearn import datasets

# load_iris() returns a dict-like object; its 'data' entry can be read either
# as iris["data"] or as the attribute iris.data.
iris = datasets.load_iris()

# Class codes, one per sample.
outcomes = iris.target
# Keep every row but only the first two feature columns.
predictors = iris.data[:, :2]

In [7]:
# Display the whole object returned by datasets.load_iris(). The output is
# long because it includes the full 'data' array.
# NOTE(review): consider showing a short slice instead (e.g. iris.data[:5]).
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [9]:
# Wrap the feature matrix in a DataFrame. The second argument of pd.DataFrame
# is `index`, so the class codes in iris.target become the (repeating) row
# labels rather than a column; the columns keep their default integer labels.
iris_data_frame = pd.DataFrame(data=iris.data, index=iris.target)

In [10]:
# Render the frame: columns are labelled 0-3 and the row labels are the
# class codes taken from iris.target.
iris_data_frame

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
0,4.9,3.0,1.4,0.2
0,4.7,3.2,1.3,0.2
0,4.6,3.1,1.5,0.2
0,5.0,3.6,1.4,0.2
0,5.4,3.9,1.7,0.4
0,4.6,3.4,1.4,0.3
0,5.0,3.4,1.5,0.2
0,4.4,2.9,1.4,0.2
0,4.9,3.1,1.5,0.1


In [15]:
# Square brackets select by column label: this returns the column labelled 0
# as a Series (its index is still the class codes).
iris_data_frame[0]

0    5.1
0    4.9
0    4.7
0    4.6
0    5.0
0    5.4
0    4.6
0    5.0
0    4.4
0    4.9
0    5.4
0    4.8
0    4.8
0    4.3
0    5.8
0    5.7
0    5.4
0    5.1
0    5.7
0    5.1
0    5.4
0    5.1
0    4.6
0    5.1
0    4.8
0    5.0
0    5.0
0    5.2
0    5.2
0    4.7
    ... 
2    6.9
2    5.6
2    7.7
2    6.3
2    6.7
2    7.2
2    6.2
2    6.1
2    6.4
2    7.2
2    7.4
2    7.9
2    6.4
2    6.3
2    6.1
2    7.7
2    6.3
2    6.4
2    6.0
2    6.9
2    6.7
2    6.9
2    5.8
2    6.8
2    6.7
2    6.7
2    6.3
2    6.5
2    6.2
2    5.9
Name: 0, Length: 150, dtype: float64

In [16]:
# .iloc selects by integer position: the first row, returned as a Series
# indexed by the column labels.
iris_data_frame.iloc[0]

0    5.1
1    3.5
2    1.4
3    0.2
Name: 0, dtype: float64

In [17]:
# Row position 0, column position 3 -> a single scalar value.
iris_data_frame.iloc[0,3]

0.2

In [18]:
# A bare ':' on both axes selects every row and every column.
iris_data_frame.iloc[:,:]

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
0,4.9,3.0,1.4,0.2
0,4.7,3.2,1.3,0.2
0,4.6,3.1,1.5,0.2
0,5.0,3.6,1.4,0.2
0,5.4,3.9,1.7,0.4
0,4.6,3.4,1.4,0.3
0,5.0,3.4,1.5,0.2
0,4.4,2.9,1.4,0.2
0,4.9,3.1,1.5,0.1


In [19]:
# All rows, first two columns (positions 0 and 1; the stop position 2 is excluded).
iris_data_frame.iloc[:,:2]

Unnamed: 0,0,1
0,5.1,3.5
0,4.9,3.0
0,4.7,3.2
0,4.6,3.1
0,5.0,3.6
0,5.4,3.9
0,4.6,3.4
0,5.0,3.4
0,4.4,2.9
0,4.9,3.1


In [13]:
# Row position 0, column position 0 -> a single scalar value.
iris_data_frame.iloc[0,0]

5.1

Loading a .csv file into a pandas DataFrame
Let's load a .csv data file into pandas.
There is a function for it, called read_csv().

In [4]:
import pandas as pd

# Parse iris.csv (path is relative to the current working directory) into a DataFrame.
iris_pd = pd.read_csv(filepath_or_buffer='iris.csv')

In [6]:
# Render the frame loaded from iris.csv.
iris_pd

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max). Only the four
# numeric measurement columns appear in the result.
iris_pd.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [10]:
# Column labels of the loaded frame.
iris_pd.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

How to use a DataFrame with KNN

In [14]:
# Split the frame by column position: position 4 is the species label and
# positions 0-3 are the four measurement columns.
targets = iris_pd.iloc[:, 4]
predictors = iris_pd.iloc[:, 0:4]

In [15]:
from sklearn.neighbors import KNeighborsClassifier  # scikit-learn's k-NN classifier

# Fit a 5-nearest-neighbours model on the measurement columns (fit() returns
# the estimator itself, so the calls can be chained), then predict a label
# for each of the same rows.
knn = KNeighborsClassifier(n_neighbors=5).fit(predictors, targets)
sk_predictions = knn.predict(predictors)

In [16]:
# Predicted species labels, one per row of `predictors`.
sk_predictions

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'virginica', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
     

In [20]:
# Share of predictions that match the true label, as a percentage.
# NOTE(review): the model is scored on the same rows it was fitted on, so
# this is training accuracy, not an estimate of performance on unseen data.
print(" Accuracy: ")
print(np.mean(sk_predictions == targets) * 100)

 Accuracy: 
96.66666666666667
