# Handling Missing Data


In [2]:
import pandas as pd
import numpy as np
a=pd.Series(['a',np.nan,'b','c'])
#a
a.isnull()

0    False
1     True
2    False
3    False
dtype: bool

In [11]:
a[0]=None
a.isnull
#a.isnull()

<bound method Series.isnull of 0    None
1     NaN
2       b
3       c
dtype: object>

# Filtering Out Missing Data

In [3]:
from numpy import nan as NA
data=pd.Series([1,2,NA,4,NA])
#data
data.dropna()#it removes NAN values

0    1.0
1    2.0
3    4.0
dtype: float64

In [19]:
 data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
 [NA, NA, NA], [NA, 6.5, 3.]])
#data
data.dropna()# dropna by default drops any row containing a missing value:

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [22]:
data.dropna(how='all')#Passing how='all' will only drop rows that are all NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [6]:
data[4] = NA
data
#data.dropna(axis=1,how='all')

0    1.0
1    2.0
2    NaN
3    4.0
4    NaN
dtype: float64

In [8]:
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4,1]=NA
df.iloc[:2,2]=22
df
df.dropna()

Unnamed: 0,0,1,2
4,0.953158,-1.458805,-1.191979
5,-0.426994,-0.882009,-1.677754
6,-0.117794,1.281276,0.913861


# Filling In Missing Data


In [10]:
#df.fillna(3)#Calling fillna with a constant replaces missing values with that value
df.fillna({1:0.5})#Calling fillna with a dict, you can use a different fill value for each column:


Unnamed: 0,0,1,2
0,0.441396,0.5,22.0
1,-0.636202,0.5,22.0
2,0.429499,0.5,0.349851
3,-0.786206,0.5,-0.004903
4,0.953158,-1.458805,-1.191979
5,-0.426994,-0.882009,-1.677754
6,-0.117794,1.281276,0.913861


In [59]:
_ = df.fillna(0, inplace=True)#fillna returns a new object, but you can modify the existing object in-place:

df

Unnamed: 0,0,1,2
0,-1.507397,0.0,22.0
1,1.173449,0.0,22.0
2,0.371514,0.0,0.243216
3,0.324342,0.0,0.928739
4,0.507271,1.209919,-1.399121
5,0.627251,-0.952625,-0.827915
6,-0.884969,1.380859,-0.148281


In [11]:
d=pd.DataFrame(np.random.randn(6,3))
#d
d.iloc[2:,1]=NA
d.iloc[4:,0]=NA
d
d.fillna(method='ffill')



Unnamed: 0,0,1,2
0,-0.015207,-1.162627,-1.686018
1,0.171591,-0.355396,0.706822
2,0.34593,-0.355396,-1.184503
3,0.337927,-0.355396,-2.048317
4,0.337927,-0.355396,-0.020517
5,0.337927,-0.355396,0.573827


# Removing Duplicates

In [15]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
 'k2': [1, 1, 2, 3, 3, 4, 4]})
data.drop_duplicates(['k1'])


Unnamed: 0,k1,k2
0,one,1
1,two,1


# Transforming Data Using a Function or Mapping


In [78]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
  'Pastrami', 'corned beef', 'Bacon',
 'pastrami', 'honey ham', 'nova lox'],
 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [79]:
meat_to_animal = {
 'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'
}

In [80]:
lowercase=data['food'].str.lower()

In [82]:
data['animal']=lowercase.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


# Replacing Values

In [20]:
import pandas as pd
import numpy as np
p=pd.Series([1,1,3,4,4])
p
#p.replace(4,NA)# select single value replace
#p.replace([1,3],[0,44])#To use a different replacement for each value, pass a list of substitutes:
#p.replace([4,3],np.nan)
p.replace({3:np.nan,1:2})#The argument passed can also be a dict

0    2.0
1    2.0
2    NaN
3    4.0
4    4.0
dtype: float64

# Renaming Axis Indexes


In [10]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=['Ohio', 'Colorado', 'New York'],
columns=['one', 'two', 'three', 'four'])
#data
transform=lambda x:x[:2].upper()
data.columns.map(transform)
data.columns=data.columns.map(transform)
data

Unnamed: 0,ON,TW,TH,FO
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [25]:
#data.rename(index=str.title,columns=str.upper)
#data.rename(index={"OHIO":"ahaa","COLO":"karela"})
#data.rename(index={"Ohio":"HAMARA"},inplace=True)
data
#Should you wish to modify a dataset in-place,pass inplace=True:

Unnamed: 0,ON,TW,TH,FO
HAMARA,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


# Discretization and Binning


# Detecting and Filtering Outliers

In [18]:
data = pd.DataFrame(np.random.randn(100, 4))
#data.describe()
#col=data[2]
#col[np.abs(col)>2]
data[(np.abs(data) > 3).any(1)]
np.sign(data).head()#The statement np.sign(data) produces 1 and –1 values based on whether the values in data are positive or negative:

Unnamed: 0,0,1,2,3
0,1.0,-1.0,-1.0,1.0
1,1.0,-1.0,1.0,-1.0
2,-1.0,1.0,-1.0,-1.0
3,-1.0,1.0,1.0,1.0
4,1.0,-1.0,1.0,1.0


# Permutation and Random Sampling

In [29]:
#Permuting (randomly reordering) a Series or the rows in a DataFrame is easy to do
#using the numpy.random.permutation function. Calling permutation with the length
#of the axis you want to permute produces an array of integers indicating the new
#ordering:
df=pd.DataFrame(np.arange(4*3).reshape(3,4))
#df
sample=np.random.permutation(3)
sample
df.take(sample)
df.sample(n=2)#To select a random subset without replacement, you can use the sample method on Series and DataFrame:


Unnamed: 0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


# Computing Indicator/Dummy Variables

In [31]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b','d'],'data1': range(7)})
df
dummy=pd.get_dummies(df['key'])
dummy

Unnamed: 0,a,b,c,d
0,0,1,0,0
1,0,1,0,0
2,1,0,0,0
3,0,0,1,0
4,1,0,0,0
5,0,1,0,0
6,0,0,0,1


# String Manipulation


# String Object Methods

In [None]:
val='a,b, c'
print(val.split(','))
#split is often combined with strip to trim whitespace (including line breaks):
piece=[x.strip() for x in val.split(',')]
print(piece)
#These substrings could be concatenated together with a two-colon delimiter using addition:
#first,second,third=piece
#first+'::'+second+"::"+third
#'::'.join(piece)
val.replace(',', '::')


# Regular Expressions

In [32]:
#suppose we wanted to split a string with a variable number of whitespace characters (tabs, spaces, and newlines).
import re
f='for a\tnice\nowesome'
re.split('\s+',f)
#If, instead, you wanted to get a list of all patterns matching the regex, you can use the findall method:
regex = re.compile('\s+')
regex.findall(f)

[' ', '\t', '\n']

In [35]:
text = """Dave dave12@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
sdgai73887@oiuo

"""
pattern = '[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, flags=re.IGNORECASE)
regex.findall(text)

['dave12@google.com',
 'steve@gmail.com',
 'rob@gmail.com',
 'ryan@yahoo.com',
 'mank9991@gmail.com']

In [7]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)
#regex.findall(text)
m = regex.match('m.salmank9991@gmail.com')
m.groups()

('m.salmank9991', 'gmail', 'com')

# Vectorized String Functions in pandas

In [13]:
import numpy as np
import pandas as pd
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
 'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
#data
data.str.contains('com')#check extension i think hahaha

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [17]:
import re
pattern='([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
data.str.findall(pattern,flags=re.IGNORECASE)


Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [19]:
data.str[:4]


Dave     dave
Steve    stev
Rob      rob@
Wes       NaN
dtype: object

In [2]:
import pandas as pd
d=pd.read_csv("example/data.csv")
d

Unnamed: 0,Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context
0,45f5bc6721f4becab5c8d27ac6512dc96e12f4a93f2666...,2019-11,Metropolitan Police Service,Metropolitan Police Service,-0.790011,51.786902,On or near Church Court,E01017631,Aylesbury Vale 021E,Other crime,Under investigation,
1,,2019-11,Metropolitan Police Service,Metropolitan Police Service,0.134947,51.588063,On or near Mead Grove,E01000027,Barking and Dagenham 001A,Anti-social behaviour,,
2,,2019-11,Metropolitan Police Service,Metropolitan Police Service,0.134947,51.588063,On or near Mead Grove,E01000027,Barking and Dagenham 001A,Anti-social behaviour,,
3,,2019-11,Metropolitan Police Service,Metropolitan Police Service,0.134947,51.588063,On or near Mead Grove,E01000027,Barking and Dagenham 001A,Anti-social behaviour,,
4,,2019-11,Metropolitan Police Service,Metropolitan Police Service,0.134947,51.588063,On or near Mead Grove,E01000027,Barking and Dagenham 001A,Anti-social behaviour,,
...,...,...,...,...,...,...,...,...,...,...,...,...
91199,20447b78e118a8d00be0123c3e78b346c0d36e5a665393...,2019-11,Metropolitan Police Service,Metropolitan Police Service,,,No Location,,,Other crime,Under investigation,
91200,9812d5a6065c0b9d37e9312b318ffa3ecc79018b704233...,2019-11,Metropolitan Police Service,Metropolitan Police Service,,,No Location,,,Other crime,Under investigation,
91201,241aed6f0b0f00c68e55add679202d0a9e1d3876d4e91f...,2019-11,Metropolitan Police Service,Metropolitan Police Service,,,No Location,,,Other crime,Under investigation,
91202,6a56c17c365db3653e7ad3df02a43ea99533b07a6fae55...,2019-11,Metropolitan Police Service,Metropolitan Police Service,,,No Location,,,Other crime,Under investigation,


In [6]:
d.Month

0        2019-11
1        2019-11
2        2019-11
3        2019-11
4        2019-11
          ...   
91199    2019-11
91200    2019-11
91201    2019-11
91202    2019-11
91203    2019-11
Name: Month, Length: 91204, dtype: object