# Pandas Introduction

### Importing Pandas Library

In [None]:
import pandas as pd

## DataFrames and Series

### Series

In [None]:
# Sample inputs for the Series examples: index labels, a plain list of
# values, and a dict mapping each label to its value.
labels = ['a', 'b', 'c']
my_list = [10, 20, 30]

data = {
    'a': 10,
    'b': 20,
    'c': 30,
}

In [None]:
# Build a Series from the list; with no index given, pandas assigns 0..n-1.
pd.Series(my_list)

In [None]:
# Same values, but labelled with the entries of `labels` instead of 0..n-1.
pd.Series(my_list, index=labels)

### DataFrame

In [None]:
# Example 1: a 5x4 frame of random draws with labelled rows (A-E) and
# columns (W-Z).

from numpy.random import randn, seed

# Seed NumPy's global RNG so that Restart & Run All rebuilds the same frame
# every time; without it each run produces different numbers.
seed(101)

df = pd.DataFrame(
    randn(5, 4),
    index='A B C D E'.split(),
    columns='W X Y Z'.split(),
)
df

# Data Manipulation

### Accessing items

In [8]:
# Using loc: label-based lookup of the row labelled 'D' and the column labelled 'W'.

df.loc['D','W']

-0.7046906889900866

In [9]:
# Using iloc: position-based lookup of the row at position 3 and the column at position 0.

df.iloc[3,0]

-0.7046906889900866

In [10]:
# Select row 'C' of column 'Y' in a single .loc call. Chained indexing
# (df['Y']['C']) performs two separate lookups and is the pattern that
# triggers SettingWithCopyWarning as soon as it is used for assignment.
df.loc['C', 'Y']

0.18216562907277875

### Adding a New Column

In [11]:
# Create (or overwrite) column 'New' as the element-wise sum of 'W' and 'X'.
df['New'] = df['W'].add(df['X'])
df

Unnamed: 0,W,X,Y,Z,New
A,0.253802,-0.195478,-0.631533,-0.27137,0.058324
B,-1.699824,-0.190895,-1.744967,0.579166,-1.890719
C,-0.497125,-0.540872,0.182166,-0.992132,-1.037997
D,-0.704691,0.49457,-1.671572,-0.932578,-0.210121
E,1.59898,0.863058,-1.373918,0.15034,2.462038


### Dropping a Column

In [12]:
# Return a copy of the frame without column 'New'; `df` itself is unchanged
# because the result is not assigned back.
df.drop(columns="New")


Unnamed: 0,W,X,Y,Z
A,0.253802,-0.195478,-0.631533,-0.27137
B,-1.699824,-0.190895,-1.744967,0.579166
C,-0.497125,-0.540872,0.182166,-0.992132
D,-0.704691,0.49457,-1.671572,-0.932578
E,1.59898,0.863058,-1.373918,0.15034


In [13]:
# Remove row 'E' and rebind `df` to the result. errors="ignore" keeps the
# cell idempotent: re-running it after 'E' is already gone no longer raises
# KeyError, which the in-place version did.
df = df.drop(index="E", errors="ignore")
df

Unnamed: 0,W,X,Y,Z,New
A,0.253802,-0.195478,-0.631533,-0.27137,0.058324
B,-1.699824,-0.190895,-1.744967,0.579166,-1.890719
C,-0.497125,-0.540872,0.182166,-0.992132,-1.037997
D,-0.704691,0.49457,-1.671572,-0.932578,-0.210121


## Dealing with Missing Values

In [14]:
# Example 2: a small frame with NaN gaps in columns A, B and C; column D is
# complete.
import numpy as np

df = pd.DataFrame(
    {
        'A': [1, np.nan, 2, np.nan],
        'B': [5, np.nan, np.nan, 6],
        'C': [1, 2, 3, np.nan],
        'D': [4, 5, 6, 7],
    }
)
df

Unnamed: 0,A,B,C,D
0,1.0,5.0,1.0,4
1,,,2.0,5
2,2.0,,3.0,6
3,,6.0,,7


### 1. Drop the missing values

In [15]:
# Drop every column that contains at least one missing value. dropna finds
# those columns itself, so no column names need to be hard-coded; on this
# frame it removes A, B and C and keeps D.
df.dropna(axis=1)

Unnamed: 0,D
0,4
1,5
2,6
3,7


In [16]:
# Keep only the columns that have at least 3 non-missing values.
df.dropna(axis="columns", thresh=3)

Unnamed: 0,C,D
0,1.0,4
1,2.0,5
2,3.0,6
3,,7


In [17]:
# For large frames, express the threshold as a share of the row count rather
# than a fixed number: keep columns that are at least 70% non-missing.
df.dropna(thresh=0.7 * len(df), axis=1)

Unnamed: 0,C,D
0,1.0,4
1,2.0,5
2,3.0,6
3,,7


### 2. Fill them with another value

In [18]:
# Replace every missing entry with the constant string "New".
df.fillna(value="New")

Unnamed: 0,A,B,C,D
0,1.0,5.0,1.0,4
1,New,New,2.0,5
2,2.0,New,3.0,6
3,New,6.0,New,7


In [19]:
# Fill every missing entry with one scalar: the mean of column 'A'. The same
# value is used for the gaps in 'B' and 'C' as well.
# NOTE(review): per-column means would be df.fillna(df.mean()) - confirm intent.
df.fillna(df['A'].mean())

Unnamed: 0,A,B,C,D
0,1.0,5.0,1.0,4
1,1.5,1.5,2.0,5
2,2.0,1.5,3.0,6
3,1.5,6.0,1.5,7


In [28]:
# Three independent frames with identical content, used as inputs for the
# concatenation examples. The literal is written once; df2 and df3 are copies
# so that each name is bound to its own DataFrame object.
df1 = pd.DataFrame(
    {
        'A': [1, 2, 3, 4],
        'B': [5, 6, 7, 8],
        'C': [9, 10, 11, 12],
        'D': [13, 14, 15, 16],
    }
)
df2 = df1.copy()
df3 = df1.copy()

In [36]:
# Stack the three frames on top of each other. `keys` adds an outer index
# level ("x", "y", "z") that records which input frame each row came from.
frames = [df1, df2, df3]
source_keys = ["x", "y", "z"]

result = pd.concat(frames, axis=0, keys=source_keys)
result

Unnamed: 0,Unnamed: 1,A,B,C,D
x,0,1,5,9,13
x,1,2,6,10,14
x,2,3,7,11,15
x,3,4,8,12,16
y,0,1,5,9,13
y,1,2,6,10,14
y,2,3,7,11,15
y,3,4,8,12,16
z,0,1,5,9,13
z,1,2,6,10,14


In [37]:
# Place the three frames side by side. With axis="columns" the keys become
# the outer level of the column index instead of the row index.
frames = [df1, df2, df3]

result = pd.concat(frames, axis="columns", keys=["x", "y", "z"])

In [34]:
# Display whichever frame is currently bound to `result`.
result

Unnamed: 0_level_0,x,x,x,x,y,y,y,y,z,z,z,z
Unnamed: 0_level_1,A,B,C,D,A,B,C,D,A,B,C,D
0,1,5,9,13,1,5,9,13,1,5,9,13
1,2,6,10,14,2,6,10,14,2,6,10,14
2,3,7,11,15,3,7,11,15,3,7,11,15
3,4,8,12,16,4,8,12,16,4,8,12,16


## Reading Data

In [31]:

# Load the housing data set; the relative path means the CSV must sit in the
# kernel's working directory.
data = pd.read_csv('USA_Housing.csv')

### Exploring data

In [24]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


In [25]:
# Print a structural summary: index range, column names, non-null counts,
# dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object 
dtypes: float64(6), object(1)
memory usage: 273.6+ KB


In [39]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns.
data.describe()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,68583.108984,5.977222,6.987792,3.98133,36163.516039,1232073.0
std,10657.991214,0.991456,1.005833,1.234137,9925.650114,353117.6
min,17796.63119,2.644304,3.236194,2.0,172.610686,15938.66
25%,61480.562388,5.322283,6.29925,3.14,29403.928702,997577.1
50%,68804.286404,5.970429,7.002902,4.05,36199.406689,1232669.0
75%,75783.338666,6.650808,7.665871,4.49,42861.290769,1471210.0
max,107701.748378,9.519088,10.759588,6.5,69621.713378,2469066.0


In [44]:
# Count the missing values in each column (all zeros means nothing to clean).

data.isna().sum()

Avg. Area Income                0
Avg. Area House Age             0
Avg. Area Number of Rooms       0
Avg. Area Number of Bedrooms    0
Area Population                 0
Price                           0
Address                         0
dtype: int64