In [1]:
# Let's load a new dataset on the number of fires in the Amazon rainforest 

import pandas as pd

# Location of the raw CSV. It is read with ISO-8859-1 (Latin-1) encoding, as
# specified below, rather than pandas' default UTF-8.
file_name = "https://raw.githubusercontent.com/rajeevratan84/datascienceforbusiness/master/amazon_fires.csv"
df = pd.read_csv(
    file_name,
    encoding="ISO-8859-1",
)

# Show the last five rows as a quick sanity check of the load.
df.tail(n=5)

Unnamed: 0,ano,mes,estado,numero,encontro
6449,2012,Dezembro,Tocantins,128,1/1/2012
6450,2013,Dezembro,Tocantins,85,1/1/2013
6451,2014,Dezembro,Tocantins,223,1/1/2014
6452,2015,Dezembro,Tocantins,373,1/1/2015
6453,2016,Dezembro,Tocantins,119,1/1/2016


In [2]:
# Map each original column header to the English name used from here on.
new_columns = dict(
    ano="year",
    estado="state",
    mes="month",
    numero="number_of_fires",
    encontro="date",
)

# Rename on the existing frame (inplace), so `df` keeps referring to the same object.
df.rename(columns=new_columns, inplace=True)

In [3]:
# Confirm the renamed headers by looking at the first five rows.
df.head(n=5)

Unnamed: 0,year,month,state,number_of_fires,date
0,1998,Janeiro,Acre,0 Fires,1/1/1998
1,1999,Janeiro,Acre,0 Fires,1/1/1999
2,2000,Janeiro,Acre,0 Fires,1/1/2000
3,2001,Janeiro,Acre,0 Fires,1/1/2001
4,2002,Janeiro,Acre,0 Fires,1/1/2002


In [4]:
# Print per-column dtypes and non-null counts for the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6454 entries, 0 to 6453
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   year             6454 non-null   int64 
 1   month            6454 non-null   object
 2   state            6454 non-null   object
 3   number_of_fires  6322 non-null   object
 4   date             6454 non-null   object
dtypes: int64(1), object(4)
memory usage: 252.2+ KB


In [5]:
# Column positions are counted from 0, left to right, in the frame's current order.
# Put date first, month second and year third; state and number_of_fires follow.
# Because these are positions rather than names, running this cell a second time
# would permute the already-reordered columns again.
new_order = [4, 1, 0, 2, 3]
df = df.iloc[:, new_order]
df.head()

Unnamed: 0,date,month,year,state,number_of_fires
0,1/1/1998,Janeiro,1998,Acre,0 Fires
1,1/1/1999,Janeiro,1999,Acre,0 Fires
2,1/1/2000,Janeiro,2000,Acre,0 Fires
3,1/1/2001,Janeiro,2001,Acre,0 Fires
4,1/1/2002,Janeiro,2002,Acre,0 Fires


In [6]:
# Preview the cleaned values without modifying df.
# str.strip(" Fires") does not remove the text " Fires": it strips any of the
# characters ' ', 'F', 'i', 'r', 'e', 's' from both ends of each value. Remove the
# literal text instead, then trim any surrounding whitespace.
df['number_of_fires'].str.replace(" Fires", "", regex=False).str.strip()

0         0
1         0
2         0
3         0
4         0
       ... 
6449    128
6450     85
6451    223
6452    373
6453    119
Name: number_of_fires, Length: 6454, dtype: object

In [7]:
# Overwrite the column with its cleaned version.
# str.strip(" Fires") treats its argument as a set of characters to strip from both
# ends, not as a suffix, so the literal text " Fires" is removed explicitly and any
# surrounding whitespace is trimmed afterwards.
df['number_of_fires'] = (
    df['number_of_fires']
    .str.replace(" Fires", "", regex=False)
    .str.strip()
)
df.head()

Unnamed: 0,date,month,year,state,number_of_fires
0,1/1/1998,Janeiro,1998,Acre,0
1,1/1/1999,Janeiro,1999,Acre,0
2,1/1/2000,Janeiro,2000,Acre,0
3,1/1/2001,Janeiro,2001,Acre,0
4,1/1/2002,Janeiro,2002,Acre,0


In [8]:
# Convert the number_of_fires column from text to a float data type.
# Series.replace matches whole cell values, so only cells that are exactly the
# empty string become "0" before the cast. The element-wise .str.replace('', '0')
# must not be used here: an empty pattern matches between every character, which
# turns "128" into "0102080" and corrupts every non-zero count.
# Missing values are left untouched and remain NaN after the cast.
df["number_of_fires"] = df["number_of_fires"].replace("", "0").astype(float)
df.head()

Unnamed: 0,date,month,year,state,number_of_fires
0,1/1/1998,Janeiro,1998,Acre,0.0
1,1/1/1999,Janeiro,1999,Acre,0.0
2,1/1/2000,Janeiro,2000,Acre,0.0
3,1/1/2001,Janeiro,2001,Acre,0.0
4,1/1/2002,Janeiro,2002,Acre,0.0


In [9]:
# Keep an independent (deep) copy of the frame as it stands at this point.
df_copy = df.copy(deep=True)

In [10]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

date                 0
month                0
year                 0
state                0
number_of_fires    132
dtype: int64

In [11]:
# Remove every row that contains a Null / NaN (not a number) value, then renumber
# the rows from 0. reset_index() is called without drop=True, so the previous row
# labels are kept as a new "index" column.
df = df.dropna().reset_index()
df.head()

Unnamed: 0,index,date,month,year,state,number_of_fires
0,0,1/1/1998,Janeiro,1998,Acre,0.0
1,1,1/1/1999,Janeiro,1999,Acre,0.0
2,2,1/1/2000,Janeiro,2000,Acre,0.0
3,3,1/1/2001,Janeiro,2001,Acre,0.0
4,4,1/1/2002,Janeiro,2002,Acre,0.0


In [12]:
# Re-count missing values per column after dropping the incomplete rows.
df.isna().sum()

index              0
date               0
month              0
year               0
state              0
number_of_fires    0
dtype: int64