# Pandas
* Pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language. 

In [1]:
#Import the libraries
from io import StringIO

import numpy as np
import pandas as pd

## Dataframes

In [2]:
# Build a 5x4 demo frame holding the integers 0..19, with labelled rows and columns.
# NOTE(review): 'Coumn4' looks like a typo for 'Column4'; kept as-is because the
# recorded outputs in this notebook use that spelling.
row_labels = ['Row1', 'Row2', 'Row3', 'Row4', 'Row5']
column_labels = ["Column1", "Column2", "Column3", "Coumn4"]
cell_values = np.arange(0, 20).reshape(5, 4)

df = pd.DataFrame(cell_values, index=row_labels, columns=column_labels)

In [3]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,Column1,Column2,Column3,Coumn4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [4]:
## Accessing the elements
# .loc selects by label: here the row labelled 'Row1', which comes back as a Series.

df.loc['Row1']

Column1    0
Column2    1
Column3    2
Coumn4     3
Name: Row1, dtype: int32

In [5]:
# .iloc selects by integer position; a full slice on both axes returns every row and column.
df.iloc[:,:]

Unnamed: 0,Column1,Column2,Column3,Coumn4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [6]:
## Take all rows, and every column from Column2 (position 1) onwards
df.iloc[:,1:]

Unnamed: 0,Column2,Column3,Coumn4
Row1,1,2,3
Row2,5,6,7
Row3,9,10,11
Row4,13,14,15
Row5,17,18,19


In [7]:
# Convert the selected part of the DataFrame into a NumPy array
df.iloc[:,1:].values

array([[ 1,  2,  3],
       [ 5,  6,  7],
       [ 9, 10, 11],
       [13, 14, 15],
       [17, 18, 19]])

In [8]:
# Count how many times each distinct value occurs in Column1.
df['Column1'].value_counts()

12    1
4     1
16    1
8     1
0     1
Name: Column1, dtype: int64

In [9]:
# Bottom rows of df: tail(3) returns the last three rows
df.tail(3)

Unnamed: 0,Column1,Column2,Column3,Coumn4
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [10]:
# Check the datatype of each column
df.dtypes

Column1    int32
Column2    int32
Column3    int32
Coumn4     int32
dtype: object

In [11]:
# The values of Column1 as a NumPy array.
df['Column1'].values

array([ 0,  4,  8, 12, 16])

In [12]:
# Boolean-mask filter: keep only the rows where Column1 is greater than 7.
df[df['Column1']>7]

Unnamed: 0,Column1,Column2,Column3,Coumn4
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [13]:
# Drop Column3.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it once the column is already gone no longer
# raises KeyError.
df = df.drop(columns=["Column3"], errors="ignore")
df.head()

Unnamed: 0,Column1,Column2,Coumn4
Row1,0,1,3
Row2,4,5,7
Row3,8,9,11
Row4,12,13,15
Row5,16,17,19


## Data Series

In [30]:
# Sample inputs: index labels, a plain list of values, the same values as a
# NumPy array, and a label -> value dictionary.
labels = ['a', 'b', 'c']
my_data = [10, 20, 30]
arr = np.asarray(my_data)
d = dict(zip(labels, my_data))

# Echo the inputs
print("Labels:", labels)
print("My data:", my_data)
print("Dictionary:", d)

# A Series built from the list, with the labels as its index
pd.Series(data=my_data, index=labels)

Labels: ['a', 'b', 'c']
My data: [10, 20, 30]
Dictionary: {'a': 10, 'b': 20, 'c': 30}


a    10
b    20
c    30
dtype: int64

In [33]:
# A Series built from the NumPy array gets a default integer index; print the entry at label 1.
numeric_series = pd.Series(arr)
print("\nHolding numerical data\n", '-' * 25, sep='')
print(numeric_series[1])


Holding numerical data
-------------------------
20


In [34]:
# A Series can hold text as well; its dtype is then reported as object.
label_series = pd.Series(labels)
print("\nHolding text labels\n", '-' * 20, sep='')
print(label_series)


Holding text labels
--------------------
0    a
1    b
2    c
dtype: object


In [35]:
# Two small Series: one with an integer index, one with string labels.
ser1 = pd.Series(data=[1, 2, 3, 4], index=[2, 4, 6, 8])
ser2 = pd.Series(data=[1, 2, 5, 4], index=['CA', 'OR', 'NV', 'AZ'])
ser1

2    1
4    2
6    3
8    4
dtype: int64

In [36]:
# Positional slice: every second element among the first three.
# ser1 has an integer index (2, 4, 6, 8), so a bare ser1[0:3:2] is easy to
# misread as label-based; .iloc states explicitly that positions are meant.
ser1.iloc[0:3:2]

2    1
6    3
dtype: int64

In [38]:
# Adding two Series aligns them on index labels: a label present in only one
# operand gives NaN, and the duplicated 'CA' label in ser1 produces one result
# row per occurrence.
ser1 = pd.Series(data=[1, 2, 3, 4], index=['CA', 'OR', 'CO', 'CA'])
ser2 = pd.Series(data=[1, 2, 5, 4], index=['CA', 'NV', 'AZ', 'OR'])
ser3 = ser1.add(ser2)
ser3

AZ    NaN
CA    2.0
CA    5.0
CO    NaN
NV    NaN
OR    6.0
dtype: float64

## Reading & Writing Files

In [14]:
# Load Data.csv into a new DataFrame with read_csv.
# NOTE(review): absolute, machine-specific path; the cell will not run on another machine as-is.
data_csv_path = 'E:\\Profond AI\\Python\\Python\\Data.csv'
txt_df = pd.read_csv(data_csv_path)

In [15]:
# Preview the first rows of the loaded data.
txt_df.head()

Unnamed: 0,OrderDate,Region,Rep,Item,Units,Unit Cost,Total
0,1-6-16,East,Jones,Pencil,95,1.99,189.05
1,1-23-16,Central,Kivell,Binder,50,19.99,999.5
2,2-9-16,Central,Jardine,Pencil,36,4.99,179.64
3,2-26-16,Central,Gill,Pen,27,19.99,539.73
4,3-15-16,West,Sorvino,Pencil,56,2.99,167.44


### In case of other delimiters 

In [16]:
# read_table takes the delimiter through `sep`; a comma is declared here
# because this file is comma-separated.
# NOTE(review): absolute, machine-specific path; the cell will not run on another machine as-is.
data_csv_path = 'E:\\Profond AI\\Python\\Python\\Data.csv'
txt_df = pd.read_table(data_csv_path, sep=",")
txt_df.head()

  


Unnamed: 0,OrderDate,Region,Rep,Item,Units,Unit Cost,Total
0,1-6-16,East,Jones,Pencil,95,1.99,189.05
1,1-23-16,Central,Kivell,Binder,50,19.99,999.5
2,2-9-16,Central,Jardine,Pencil,36,4.99,179.64
3,2-26-16,Central,Gill,Pen,27,19.99,539.73
4,3-15-16,West,Sorvino,Pencil,56,2.99,167.44


## Understanding the data

In [17]:
# (number of rows, number of columns)
txt_df.shape

(43, 7)

In [18]:
# Summary statistics for the numeric columns (Units and Unit Cost here).
txt_df.describe()

Unnamed: 0,Units,Unit Cost
count,43.0,43.0
mean,49.325581,20.308605
std,30.078248,47.345118
min,2.0,1.29
25%,27.5,3.99
50%,53.0,4.99
75%,74.5,17.99
max,96.0,275.0


In [19]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
txt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 7 columns):
OrderDate    43 non-null object
Region       43 non-null object
Rep          43 non-null object
Item         43 non-null object
Units        43 non-null int64
Unit Cost    43 non-null float64
Total        43 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 2.4+ KB


In [20]:
# The column labels of the frame.
txt_df.columns

Index(['OrderDate', 'Region', 'Rep', 'Item', 'Units', 'Unit Cost', 'Total'], dtype='object')

In [21]:
# Subset the dataframe: keep only the four listed columns
subset_columns = ['OrderDate', 'Region', 'Rep', 'Item']
txt_df1 = txt_df[subset_columns]
txt_df1.head()

Unnamed: 0,OrderDate,Region,Rep,Item
0,1-6-16,East,Jones,Pencil
1,1-23-16,Central,Kivell,Binder
2,2-9-16,Central,Jardine,Pencil
3,2-26-16,Central,Gill,Pen
4,3-15-16,West,Sorvino,Pencil


In [22]:
# Make a new column: Units multiplied element-wise by Unit Cost
txt_df['tot'] = txt_df['Units'].mul(txt_df['Unit Cost'])
txt_df.head()

Unnamed: 0,OrderDate,Region,Rep,Item,Units,Unit Cost,Total,tot
0,1-6-16,East,Jones,Pencil,95,1.99,189.05,189.05
1,1-23-16,Central,Kivell,Binder,50,19.99,999.5,999.5
2,2-9-16,Central,Jardine,Pencil,36,4.99,179.64,179.64
3,2-26-16,Central,Gill,Pen,27,19.99,539.73,539.73
4,3-15-16,West,Sorvino,Pencil,56,2.99,167.44,167.44


In [23]:
# Rename the derived column to a more readable label
new_column_names = {'tot': 'Tot Sales'}
txt_df = txt_df.rename(columns=new_column_names)
txt_df.head()

Unnamed: 0,OrderDate,Region,Rep,Item,Units,Unit Cost,Total,Tot Sales
0,1-6-16,East,Jones,Pencil,95,1.99,189.05,189.05
1,1-23-16,Central,Kivell,Binder,50,19.99,999.5,999.5
2,2-9-16,Central,Jardine,Pencil,36,4.99,179.64,179.64
3,2-26-16,Central,Gill,Pen,27,19.99,539.73,539.73
4,3-15-16,West,Sorvino,Pencil,56,2.99,167.44,167.44


## Write the dataframe to another location

In [24]:
# Write the dataframe to a CSV file; the relative path resolves against the current working directory.
# By default to_csv also writes the row index as the first column (pass index=False to omit it).
txt_df.to_csv('first_df.csv')

## Reading JSON object

In [25]:
# Sample JSON document held as a string: a name, an age, and a list of car records.
json_obj="""
{   "name":"John",
    "age":30,
    "cars": [
        { "name":"Ford", "models":[ "Fiesta", "Focus", "Mustang" ] },
        { "name":"BMW", "models":[ "320", "X3", "X5" ] },
        { "name":"Fiat", "models":[ "500", "Panda" ] }
    ]
 }
 """

In [26]:
import json

In [27]:
# Parse the JSON string into a DataFrame with pd.read_json.
# The text is wrapped in StringIO because passing a literal JSON string
# straight to read_json is deprecated in recent pandas releases; a file-like
# object is accepted by old and new versions alike.
json_df = pd.read_json(StringIO(json_obj))

In [28]:
# View the new df: one row per entry of the "cars" list, with the scalar fields repeated on each row
json_df

Unnamed: 0,name,age,cars
0,John,30,"{'name': 'Ford', 'models': ['Fiesta', 'Focus',..."
1,John,30,"{'name': 'BMW', 'models': ['320', 'X3', 'X5']}"
2,John,30,"{'name': 'Fiat', 'models': ['500', 'Panda']}"


### Convert CSV to JSON

In [29]:
# Read the wine data file straight from its URL (needs network access).
# header=None: no row of the file is used as column names.
wine_data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
data = pd.read_csv(wine_data_url, header=None)
data.head()

URLError: <urlopen error [Errno 11001] getaddrinfo failed>

In [None]:
# Save the downloaded frame as a local CSV file.
data.to_csv('wine.csv')

In [None]:
# Serialise the DataFrame to a JSON string; orient="index" keys the output by row label
# (other orient values produce different JSON layouts).

data.to_json(orient="index")

## Dealing with HTML 

In [None]:
# Parser libraries pd.read_html can use; uncomment to install any that are missing.
#!pip install lxml
#!pip install beautifulsoup4
#!pip install html5lib

from pandas import read_html
import lxml

In [None]:
# URL of the web page whose HTML tables will be read.
url='http://stats.espncricinfo.com/ci/content/records/210099.html'

In [None]:
# read_html returns a list of DataFrames, one per table it parses from the page.
url_df = pd.read_html(url)

In [None]:
# Pick the first DataFrame from the list returned by read_html and preview it
final_url_df=url_df[0]
final_url_df.head()

## Reading Excel Files

In [None]:
# Load the Excel workbook into a DataFrame.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
excel_path = 'E:\\Profond AI\\Order_Data.xlsx'
df_excel = pd.read_excel(excel_path)

In [None]:
# Preview the first rows read from the workbook.
df_excel.head()