In [1]:
#importing pandas
import pandas as pd
# Keep rendered output compact: at most 10 columns and 10 rows, a 60-character
# display width, and the HTML repr for DataFrames in the notebook.
pd.set_option(
    'display.max_columns', 10,
    'display.max_rows', 10,
    'display.width', 60,
    'display.notebook_repr_html', True,
)

In [2]:
# Build a Series from a plain Python list; pandas supplies a default 0..n-1 index
s = pd.Series(data=[1, 2, 3, 4, 5])
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [3]:
# Look up a single element by its index label
s.loc[1]

2

In [4]:
# Look up several elements at once by passing a list of index labels
s.loc[[2, 4]]

2    3
4    5
dtype: int64

In [5]:
# A Series can carry user-defined labels: pass them through the `index` parameter
a = pd.Series(data=[3, 2, 1, 4, 5], index=list('abcde'))
a

a    3
b    2
c    1
d    4
e    5
dtype: int64

In [6]:
# With a labelled index, elements can be selected by their labels
a.loc[['b', 'e']]

b    2
e    5
dtype: int64

In [7]:
#To select by 0-based position (like an array) on a Series whose index is not integer, use .iloc.
#Plain a[[1,2]] only works through a deprecated positional fallback and emits a FutureWarning.
a.iloc[[1,2]]

  a[[1,2]]


b    2
c    1
dtype: int64

In [8]:
#we can examine the index of a series using the index property
a.index


Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [9]:
# Create an index whose entries are the consecutive dates between the two given dates (both included)
dates = pd.date_range(start='2023-01-01', end='2023-01-06')
dates

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03',
               '2023-01-04', '2023-01-05', '2023-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
# date_range produced a DatetimeIndex, a specialised index type for dates.
# Use it to label a Series holding one temperature reading per date.
temps = pd.Series(data=[80, 82, 85, 90, 83, 87], index=dates)

In [11]:
temps

2023-01-01    80
2023-01-02    82
2023-01-03    85
2023-01-04    90
2023-01-05    83
2023-01-06    87
Freq: D, dtype: int64

In [12]:
# Look up the temperature for one specific date using its label
temps.loc['2023-01-05']

83

In [13]:
# A second set of readings on the same dates
temps1 = pd.Series(data=[70, 75, 69, 83, 79, 77], index=dates)
# Subtraction is aligned on matching index labels
temp_diff = temps.sub(temps1)
temp_diff

2023-01-01    10
2023-01-02     7
2023-01-03    16
2023-01-04     7
2023-01-05     4
2023-01-06    10
Freq: D, dtype: int64

In [14]:
#The index holds dates, not integers, so select by 0-based position explicitly with .iloc.
#Plain temp_diff[0] relies on a deprecated positional fallback and emits a FutureWarning.
temp_diff.iloc[0]

  temp_diff[0]


10

In [15]:
#finding the mean
temp_diff.mean()

9.0

In [16]:
#Pandas DataFrame.
#In a Series only one value can be associated with each index label; when we want several values per label we can use a DataFrame.
#Each Series becomes one column of the DataFrame.


In [17]:
# Combine the two Series into a DataFrame; each keyword becomes a column name
temps_df = pd.DataFrame(data=dict(Nairobi=temps, Kiambu=temps1))
temps_df

Unnamed: 0,Nairobi,Kiambu
2023-01-01,80,70
2023-01-02,82,75
2023-01-03,85,69
2023-01-04,90,83
2023-01-05,83,79
2023-01-06,87,77


In [18]:
#columns in a Dataframe can be accessed using an indexer with the name of column or a list of column names
temps_df["Nairobi"]

2023-01-01    80
2023-01-02    82
2023-01-03    85
2023-01-04    90
2023-01-05    83
2023-01-06    87
Freq: D, Name: Nairobi, dtype: int64

In [19]:
temps_df["Kiambu"]

2023-01-01    70
2023-01-02    75
2023-01-03    69
2023-01-04    83
2023-01-05    79
2023-01-06    77
Freq: D, Name: Kiambu, dtype: int64

In [20]:
temps_df[["Nairobi","Kiambu"]]

Unnamed: 0,Nairobi,Kiambu
2023-01-01,80,70
2023-01-02,82,75
2023-01-03,85,69
2023-01-04,90,83
2023-01-05,83,79
2023-01-06,87,77


In [21]:
temps_df.Nairobi

2023-01-01    80
2023-01-02    82
2023-01-03    85
2023-01-04    90
2023-01-05    83
2023-01-06    87
Freq: D, Name: Nairobi, dtype: int64

In [22]:
temps_df.Kiambu

2023-01-01    70
2023-01-02    75
2023-01-03    69
2023-01-04    83
2023-01-05    79
2023-01-06    77
Freq: D, Name: Kiambu, dtype: int64

In [23]:
temps_df.Nairobi - temps_df.Kiambu

2023-01-01    10
2023-01-02     7
2023-01-03    16
2023-01-04     7
2023-01-05     4
2023-01-06    10
Freq: D, dtype: int64

In [24]:
# Store the per-date difference between the two cities as a new column
temps_df["Differences"] = temps_df["Nairobi"].sub(temps_df["Kiambu"])

In [25]:
temps_df

Unnamed: 0,Nairobi,Kiambu,Differences
2023-01-01,80,70,10
2023-01-02,82,75,7
2023-01-03,85,69,16
2023-01-04,90,83,7
2023-01-05,83,79,4
2023-01-06,87,77,10


In [26]:
#Names of columns in the Dataframe are accessible via the columns property
temps_df.columns

Index(['Nairobi', 'Kiambu', 'Differences'], dtype='object')

In [27]:
# Accessing values in a columns can be done using the following way
temps_df.Differences[:]

2023-01-01    10
2023-01-02     7
2023-01-03    16
2023-01-04     7
2023-01-05     4
2023-01-06    10
Freq: D, Name: Differences, dtype: int64

In [28]:
# Positional slice of the column: rows at positions 1 and 2 (the stop position is excluded)
temps_df["Differences"].iloc[1:3]

2023-01-02     7
2023-01-03    16
Freq: D, Name: Differences, dtype: int64

In [29]:
#To retrieve entire rows from the DataFrame one can use the .loc and .iloc properties
# .loc looks a row up by its index label.
# .iloc looks a row up by its 0-based position.
temps_df.iloc[1]

Nairobi        82
Kiambu         75
Differences     7
Name: 2023-01-02 00:00:00, dtype: int64

In [30]:
#retrieving using the index label 
temps_df.loc['2023-01-03']

Nairobi        85
Kiambu         69
Differences    16
Name: 2023-01-03 00:00:00, dtype: int64

In [31]:
# Values of the Differences column for the rows at positions 1, 3 and 4
temps_df["Differences"].iloc[[1, 3, 4]]

2023-01-02    7
2023-01-04    7
2023-01-05    4
Name: Differences, dtype: int64

In [32]:
# Rows of a data frame can be selected based on the logical expression that is applied to the data in each row
temps_df.Nairobi >60

2023-01-01    True
2023-01-02    True
2023-01-03    True
2023-01-04    True
2023-01-05    True
2023-01-06    True
Freq: D, Name: Nairobi, dtype: bool

In [33]:
temps_df.Nairobi>80

2023-01-01    False
2023-01-02     True
2023-01-03     True
2023-01-04     True
2023-01-05     True
2023-01-06     True
Freq: D, Name: Nairobi, dtype: bool

In [34]:
# Keep only the rows where Nairobi's temperature is greater than 80
temps_df.loc[temps_df["Nairobi"] > 80]

Unnamed: 0,Nairobi,Kiambu,Differences
2023-01-02,82,75,7
2023-01-03,85,69,16
2023-01-04,90,83,7
2023-01-05,83,79,4
2023-01-06,87,77,10


In [35]:
import zipfile
# Unpack every member of ./archive.zip into the ./data directory
with zipfile.ZipFile(file='./archive.zip', mode='r') as archive:
    archive.extractall(path='./data')

In [36]:
#Reading a file using pandas
df = pd.read_csv('./data/housing_price_dataset.csv')

In [37]:
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


In [38]:
# Read a single cell with one .loc[row_label, column] lookup instead of chained
# indexing (df[col][row]), which performs two separate lookups.
df.loc[0, "SquareFeet"]

2126

In [39]:
df.index

RangeIndex(start=0, stop=50000, step=1)

In [40]:
df.shape

(50000, 6)

In [41]:
# Series can be created from Python lists, NumPy functions and dictionaries.
# Here: consecutive integers 11..15 (note that this rebinds the name `a`).
a = pd.Series(data=range(11, 16))
a

0    11
1    12
2    13
3    14
4    15
dtype: int64

In [42]:
# A Series of single-character strings (the same values list('abcde') would produce)
b = pd.Series(data=['a', 'b', 'c', 'd', 'e'])
b

0    a
1    b
2    c
3    d
4    e
dtype: object

In [43]:
# Building a Series from a NumPy array.
# np.arange yields consecutive integers from start (4, included) up to stop (10, excluded).
import numpy as np
c = pd.Series(data=np.arange(start=4, stop=10))
c

0    4
1    5
2    6
3    7
4    8
5    9
dtype: int32

In [44]:
# np.linspace takes the number of values to generate (7 here) rather than a step,
# and returns them evenly spaced between 5 and 9 with both endpoints included
d = pd.Series(data=np.linspace(start=5, stop=9, num=7))
d

0    5.000000
1    5.666667
2    6.333333
3    7.000000
4    7.666667
5    8.333333
6    9.000000
dtype: float64

In [45]:
# Draw five samples from the standard normal distribution; seeding NumPy's
# global generator first makes the draw repeatable
np.random.seed(2024)
e = pd.Series(data=np.random.normal(loc=0.0, scale=1.0, size=5))

In [46]:
e

0    1.668047
1    0.737348
2   -0.201538
3   -0.150912
4    0.916052
dtype: float64

In [47]:
#There are properties that we can explore; .values exposes the data as a NumPy array
e.values

array([ 1.66804732,  0.73734773, -0.20153776, -0.15091195,  0.91605181])

In [48]:
e.index

RangeIndex(start=0, stop=5, step=1)

In [49]:
e.shape


(5,)

In [50]:
df.shape

(50000, 6)

In [51]:
# A nine-item Series: the values 1..9 labelled 'a'..'i'
f = pd.Series(data=np.arange(1, 10), index=list('abcdefghi'))
f

a    1
b    2
c    3
d    4
e    5
f    6
g    7
h    8
i    9
dtype: int32

In [52]:
f['a']

1

In [53]:
f[['c','g']]

c    3
g    7
dtype: int32

In [54]:
f.loc['b']

2

In [55]:
f.iloc[[2,3]]

c    3
d    4
dtype: int32

In [56]:
# Select by position through .iloc: plain f[[2,3]] on this label-indexed Series
# relies on a deprecated positional fallback and emits a FutureWarning.
f.iloc[[2,3]]

  f[[2,3]]


c    3
d    4
dtype: int32

In [57]:
# .loc with a list of labels requires every label to exist; 'k' is not in the
# index, so this lookup raises KeyError. Catch it and show the message so the
# notebook still runs from top to bottom.
try:
    f.loc[['a','k']]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "['k'] not in index"

In [58]:
#slicing Data 
f[:]

a    1
b    2
c    3
d    4
e    5
f    6
g    7
h    8
i    9
dtype: int32

In [64]:
f.iloc[-4:]

f    6
g    7
h    8
i    9
dtype: int32

In [66]:
# Start at position 4 and step backwards by 2, returning the values at positions 4, 2 and 0
f.iloc[4::-2]

e    5
c    3
a    1
dtype: int32

In [67]:
# A Series with a non-integer index can still be sliced by position:
# positions 1 and 2 are returned (the stop position 3 is excluded)
f.iloc[1:3]

b    2
c    3
dtype: int32

In [68]:
# Modify the value stored under label 'a' in place
f.loc['a'] = 20

In [69]:
f

a    20
b     2
c     3
d     4
e     5
f     6
g     7
h     8
i     9
dtype: int32

In [72]:
# Create a DataFrame from a 2-D array; rows and columns get default integer labels
df_1 = pd.DataFrame(data=np.array([[11, 12], [13, 14]]))
df_1

Unnamed: 0,0,1
0,11,12
1,13,14


In [76]:
# Same construction as before, this time naming the columns explicitly
# (note that this rebinds the name `df`)
df = pd.DataFrame(
    data=np.array([[80, 89], [72, 80]]),
    columns=["Kiambu", "Nairobi"],
)

In [77]:
df

Unnamed: 0,Kiambu,Nairobi
0,80,89
1,72,80


In [78]:
# To find the length or the number of rows we can use the len function
len(df)

2

In [79]:
# We can also find the dimensionality: .shape is a (number of rows, number of columns) tuple
df.shape

(2, 2)

In [81]:
df.mean()

Kiambu     76.0
Nairobi    84.5
dtype: float64

In [82]:
df.sum()

Kiambu     152
Nairobi    169
dtype: int64

In [83]:
# Creating a DataFrame from a Python dictionary:
# first pair each town name with its list of readings
temps_1 = [23, 12, 45]
temps_2 = [43, 12, 42]
towns = ["Nairobi", "Limuru"]
dic = dict(zip(towns, (temps_1, temps_2)))
dic

{'Nairobi': [23, 12, 45], 'Limuru': [43, 12, 42]}

In [84]:
df_2 = pd.DataFrame(dic)

In [85]:
df_2

Unnamed: 0,Nairobi,Limuru
0,23,43
1,12,12
2,45,42


In [91]:
# A list of Series passed to DataFrame becomes one row per Series
temps_at_time1 = pd.Series(data=[89, 78, 90, 32])
temps_at_time2 = pd.Series(data=[39, 68, 90, 52])
df = pd.DataFrame(data=[temps_at_time1, temps_at_time2])

In [92]:
df

Unnamed: 0,0,1,2,3
0,89,78,90,32
1,39,68,90,52


In [94]:
df_3 = pd.DataFrame([temps_at_time1,temps_at_time2])
df_3

Unnamed: 0,0,1,2,3
0,89,78,90,32
1,39,68,90,52


In [96]:
 df.columns = ["Mombasa","Kitui","Muranga","Juja"]

In [97]:
df

Unnamed: 0,Mombasa,Kitui,Muranga,Juja
0,89,78,90,32
1,39,68,90,52


In [98]:
# Passing columns= together with a list of Series does not relabel the data:
# the resulting frame is entirely NaN, presumably because the requested column
# names do not match the Series' own integer labels (0-3).
df_4 = pd.DataFrame([temps_at_time1,temps_at_time2],
                   columns = ["Mombasa","Kitui","Muranga","Juja"])

In [99]:
df_4

Unnamed: 0,Mombasa,Kitui,Muranga,Juja
0,,,,
1,,,,


In [100]:
# To rectify this, build the frame first and only then replace the column labels,
# so the values are kept
df_5 = pd.DataFrame(data=[temps_at_time1, temps_at_time2])
df_5 = df_5.set_axis(["Mombasa", "Kitui", "Muranga", "Juja"], axis=1)
df_5

Unnamed: 0,Mombasa,Kitui,Muranga,Juja
0,89,78,90,32
1,39,68,90,52


In [102]:
# Manipulating the DataFrame structure
## RENAMING COLUMNS ##
df = pd.read_csv("./data/housing_price_dataset.csv")

In [103]:
df.columns

Index(['SquareFeet', 'Bedrooms', 'Bathrooms',
       'Neighborhood', 'YearBuilt', 'Price'],
      dtype='object')

In [104]:
df = df.rename(columns = { 'Bedrooms' : 'BedRooms'})

In [105]:
df.columns


Index(['SquareFeet', 'BedRooms', 'Bathrooms',
       'Neighborhood', 'YearBuilt', 'Price'],
      dtype='object')

In [106]:
df.head()

Unnamed: 0,SquareFeet,BedRooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


In [107]:
# To apply the rename to df itself instead of rebinding the name, pass inplace=True.
# NOTE: with inplace=True the call modifies df and returns None, so its result must not be assigned.
df.rename(columns = {'Bathrooms':'BathRooms'},inplace=True)

In [108]:
df.columns

Index(['SquareFeet', 'BedRooms', 'BathRooms',
       'Neighborhood', 'YearBuilt', 'Price'],
      dtype='object')

In [109]:
df[:]

Unnamed: 0,SquareFeet,BedRooms,BathRooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065
...,...,...,...,...,...,...
49995,1282,5,3,Rural,1975,100080.865895
49996,2854,2,2,Suburb,1988,374507.656727
49997,2979,5,3,Suburb,1962,384110.555590
49998,2596,5,2,Rural,1984,380512.685957


In [110]:
df[:5]

Unnamed: 0,SquareFeet,BedRooms,BathRooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


In [112]:
# Adding a new column: assigning to a new column name appends it at the end
df["RoundedPrice"] = df["Price"].round(decimals=0)

In [113]:
df.columns

Index(['SquareFeet', 'BedRooms', 'BathRooms',
       'Neighborhood', 'YearBuilt', 'Price',
       'RoundedPrice'],
      dtype='object')

In [114]:
df[:5]

Unnamed: 0,SquareFeet,BedRooms,BathRooms,Neighborhood,YearBuilt,Price,RoundedPrice
0,2126,4,1,Rural,1969,215355.283618,215355.0
1,2459,3,2,Rural,1980,195014.221626,195014.0
2,1860,2,1,Suburb,1970,306891.012076,306891.0
3,2294,2,1,Urban,1996,206786.787153,206787.0
4,2130,5,2,Suburb,2001,272436.239065,272436.0


In [115]:
# Insert a constant-valued 'PER' column at position 6 (just before 'RoundedPrice').
# DataFrame.insert modifies df in place and raises ValueError if the column
# already exists, so guard the call to keep this cell safe to re-run.
if 'PER' not in df.columns:
    df.insert(loc=6, column='PER', value=0.0)

In [116]:
df.columns

Index(['SquareFeet', 'BedRooms', 'BathRooms',
       'Neighborhood', 'YearBuilt', 'Price', 'PER',
       'RoundedPrice'],
      dtype='object')

In [118]:
df[:3]

Unnamed: 0,SquareFeet,BedRooms,BathRooms,Neighborhood,YearBuilt,Price,PER,RoundedPrice
0,2126,4,1,Rural,1969,215355.283618,0.0,215355.0
1,2459,3,2,Rural,1980,195014.221626,0.0,195014.0
2,1860,2,1,Suburb,1970,306891.012076,0.0,306891.0


In [119]:
# Dropping columns from a DataFrame: drop returns a new frame without the
# column and leaves df itself unchanged
df.drop(columns='PER')

Unnamed: 0,SquareFeet,BedRooms,BathRooms,Neighborhood,YearBuilt,Price,RoundedPrice
0,2126,4,1,Rural,1969,215355.283618,215355.0
1,2459,3,2,Rural,1980,195014.221626,195014.0
2,1860,2,1,Suburb,1970,306891.012076,306891.0
3,2294,2,1,Urban,1996,206786.787153,206787.0
4,2130,5,2,Suburb,2001,272436.239065,272436.0
...,...,...,...,...,...,...,...
49995,1282,5,3,Rural,1975,100080.865895,100081.0
49996,2854,2,2,Suburb,1988,374507.656727,374508.0
49997,2979,5,3,Suburb,1962,384110.555590,384111.0
49998,2596,5,2,Rural,1984,380512.685957,380513.0


In [120]:
df.columns

Index(['SquareFeet', 'BedRooms', 'BathRooms',
       'Neighborhood', 'YearBuilt', 'Price', 'PER',
       'RoundedPrice'],
      dtype='object')

In [121]:
# Rebind df to the result so the column is actually gone
df = df.drop(columns=['PER'])

In [122]:
df

Unnamed: 0,SquareFeet,BedRooms,BathRooms,Neighborhood,YearBuilt,Price,RoundedPrice
0,2126,4,1,Rural,1969,215355.283618,215355.0
1,2459,3,2,Rural,1980,195014.221626,195014.0
2,1860,2,1,Suburb,1970,306891.012076,306891.0
3,2294,2,1,Urban,1996,206786.787153,206787.0
4,2130,5,2,Suburb,2001,272436.239065,272436.0
...,...,...,...,...,...,...,...
49995,1282,5,3,Rural,1975,100080.865895,100081.0
49996,2854,2,2,Suburb,1988,374507.656727,374508.0
49997,2979,5,3,Suburb,1962,384110.555590,384111.0
49998,2596,5,2,Rural,1984,380512.685957,380513.0


In [123]:
rounded = df.pop("RoundedPrice")

In [124]:
rounded

0        215355.0
1        195014.0
2        306891.0
3        206787.0
4        272436.0
           ...   
49995    100081.0
49996    374508.0
49997    384111.0
49998    380513.0
49999    221619.0
Name: RoundedPrice, Length: 50000, dtype: float64

In [125]:
#pop removes the named column from the DataFrame and returns it as a Series