In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the Titanic passenger table and report its (rows, columns) size.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
titanic_csv = "C:/Users/user/Downloads/titanic.csv"
titanic = pd.read_csv(titanic_csv)
titanic.shape

(891, 12)

In [4]:
# count() gives the number of valid observations, either for a whole data frame
# or for a single variable. "Valid" here means anything that is not np.nan.
# Called on the frame, it returns one count per variable.
titanic.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [5]:
# isna() flags every missing value; summing the flags column by column
# gives the number of missing values in each variable.
titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Number of valid (non-missing) observations in the single variable SibSp.
titanic["SibSp"].count()

np.int64(891)

In [7]:
# dropna(subset=...) removes the rows that have a missing value (NaN) in Age or Parch.
# Zeros are ordinary values, not missing ones, so rows containing 0 are kept.
t = titanic.dropna(subset=["Age", "Parch"])
t.shape

(714, 12)

In [8]:
# Without a subset, a row is dropped as soon as any of its columns is missing.
t = titanic.dropna(axis=0, how="any")
t.shape

(183, 12)

In [9]:
# Smallest and largest recorded age, shown as a (min, max) tuple.
titanic["Age"].min(), titanic["Age"].max()

(0.42, 80.0)

In [10]:
# Exercise 6.1
# How many Fare values are missing?
titanic.Fare.isna().sum()

np.int64(0)

In [11]:
# Storage type of the Fare column.
titanic["Fare"].dtype

dtype('float64')

In [12]:
# Smallest and largest fare, shown as a (min, max) tuple.
fare = titanic["Fare"]
fare.min(), fare.max()

(0.0, 512.3292)

In [13]:
# Small example series keyed by country code; the KH value is missing.
populations = pd.Series({"MY": 32.7, "ID": 267.7, "KH": np.nan})
populations

MY     32.7
ID    267.7
KH      NaN
dtype: float64

In [14]:
# Overwrite the missing entries with 0, selecting them with a logical mask.
# This changes `populations` itself, so later cells see 0 instead of NaN.
missing = populations.isna()
populations.loc[missing] = 0
populations

MY     32.7
ID    267.7
KH      0.0
dtype: float64

In [15]:
# Forward-fill: replace each missing value with the last valid value before it.
# ffill() is used because fillna(method="ffill") is deprecated in recent pandas.
# The missing KH entry was already overwritten with 0 in the previous cell,
# so there is nothing left to fill and the series is displayed unchanged.
populations.ffill()

  populations.fillna(method="ffill")            # use "fillna" method to replace all missings with certain values.


MY     32.7
ID    267.7
KH      0.0
dtype: float64

In [16]:
# Two countries observed in 2016 and 2017; the VN figure for 2016 is missing.
# The year is used as the (non-unique) index.
years = [2016, 2017, 2016, 2017]
populations = pd.DataFrame(
    {
        "population": [23_618_200, 23_674_546, np.nan, 94_600_648],
        "country": ["TW", "TW", "VN", "VN"],
    },
    index=years,
)

In [17]:
# Plain forward-fill works down the rows and ignores the country column,
# so the missing VN 2016 value is filled with the TW 2017 figure from the row above.
# ffill() replaces the deprecated fillna(method="ffill").
populations.ffill()

  populations.fillna(method="ffill")


Unnamed: 0,population,country
2016,23618200.0,TW
2017,23674546.0,TW
2016,23674546.0,VN
2017,94600648.0,VN


In [18]:
# Forward-fill within each country separately. VN has no earlier observation,
# so its 2016 value stays missing instead of borrowing the TW figure.
# The result holds the population column only (the grouping column is not returned).
# groupby(...).ffill() replaces the deprecated groupby(...).fillna(method="ffill").
populations.groupby("country").ffill()

  populations.groupby("country").fillna(method="ffill")
  populations.groupby("country").fillna(method="ffill")


Unnamed: 0,population
2016,23618200.0
2017,23674546.0
2016,
2017,94600648.0


In [19]:
# Missing values and mathematical operations.
# Three ages, the last one missing; pandas .sum and .mean skip missing values.
observed_ages = [5, 25, np.nan]
age = pd.Series(observed_ages)

In [20]:
# mean() skips the missing value: (5 + 25) / 2 = 15.
age.mean()

np.float64(15.0)

In [21]:
# sum() skips the missing value but len() counts it, so this divides 30 by 3
# and differs from age.mean().
age.sum()/len(age)

np.float64(10.0)

In [22]:
# Flag ages below 14. The comparison with the missing age comes out as False.
child = age.lt(14)
child

0     True
1    False
2    False
dtype: bool

In [23]:
# The mean of a boolean series is the share of True values.
# The missing age compares as False, so it is counted as "not a child"
# and the share is 1 out of 3 rather than 1 out of 2.
(age < 14).mean()

np.float64(0.3333333333333333)

In [24]:
# Converting variables
# Converting categorical variables
# Example answers to "number of kids"; the negative entries are the ones
# recoded as missing in the cells below.
reported_kids = [0, 2, 4, 8, -1, -2]
wvs = pd.DataFrame({"kids": reported_kids})
wvs

Unnamed: 0,kids
0,0
1,2
2,4
3,8
4,-1
5,-2


In [None]:
# pd.cut function
# It is a dedicated pandas function to cut continuous numeric data into pre-defined intervals.
# x is the variable to be cut, bins are the interval boundaries, labels are the interval names, and right tells whether the right boundary value
# is included in the interval (right=True means the intervals are right-closed; right=False makes them left-closed and right-open).
# Output for bins [0, 30, 60, 100] with right=True:
# (0, 30]   - open on left, closed on right: includes 30, excludes 0
# (30, 60]  - open on left, closed on right: includes 60, excludes 30
# (60, 100] - open on left, closed on right: includes 100, excludes 60

In [26]:
# Cut seven sample values into three intervals bounded by 0, 30, 60 and 100.
x = [23, 45, 67, 89, 12, 34, 56]
bins = [0, 30, 60, 100]
# right=True closes each interval on the right; labels=None keeps the
# interval notation itself as the category names.
pd.cut(x, bins=bins, labels=None, right=True)

[(0, 30], (30, 60], (60, 100], (60, 100], (0, 30], (30, 60], (30, 60]]
Categories (3, interval[int64, right]): [(0, 30] < (30, 60] < (60, 100]]

In [None]:
# Recode the number of kids into labelled categories with pd.cut.
# right=False makes every interval left-closed, so the bins read
# [-inf, 0) -> "NA", [0, 1) -> "0", [1, 2) -> "1", [2, inf) -> "2 or more".
wvs["kids1"] = pd.cut(wvs.kids,
                      bins=[-np.inf, 0, 1, 2, np.inf],
                      labels=["NA", "0", "1", "2 or more"],
                      right=False)
# Display the frame with the new column (the name is `wvs`; `ws` is not defined anywhere).
wvs

In [None]:
# Exercise 6.2
# Years of schooling to test on: 10 through 17.
school = np.arange(10, 18)
# Convert years to education levels. right=False makes each bin left-closed,
# so 12 falls in "HS", 13-15 in "Some college" and 16 or more in "College".
edges = [-np.inf, 12, 13, 16, np.inf]
levels = ["Less than HS", "HS", "Some college", "College"]
categories = pd.cut(school, bins=edges, labels=levels, right=False)
# Show years and categories next to each other by using the years as the index.
pd.Series(categories, index=school)

In [None]:
# np.where is a vectorized if-else statement: it takes a vectorized logical
# condition, the value if the condition is true, and the value if it is false.
# Inner call: cap counts above 2 at 2. Outer call: turn negative codes into missing values.
wvs["kids2"] = np.where(wvs.kids < 0,
                        np.nan,
                        np.where(wvs.kids > 2, 2, wvs.kids))
wvs

In [None]:
# "2 or more" cannot be encoded as a number, so the categories are stored as text:
# .astype(str) turns the remaining counts into strings.
# Step 1: label counts of 2 and above as "2 or more", keep the others as text.
wvs["kids2"]=np.where(wvs.kids>=2, "2 or more" , wvs. kids.astype(str))
# Step 2: replace negative codes with a missing value. This is done as a separate
# step on the stored column; the order of the two steps should be kept as is.
wvs["kids2"]=np.where(wvs.kids<0, np.nan,wvs.kids2)
# kids2 is now presumably an object column mixing strings and NaN — check with wvs.dtypes.
wvs

In [None]:
# Replacing selected values in a data frame.
# .loc indexes rows by a logical vector (a condition) and columns by name,
# so only the selected cells are overwritten.
kid_counts = wvs.kids
wvs["kids4"] = kid_counts.astype(str)
wvs.loc[kid_counts >= 2, "kids4"] = "2 or more"
wvs.loc[kid_counts < 0, "kids4"] = np.nan
wvs

In [None]:
# Converting categorical variables to dummies (categorical -> vectors of numbers).
# pd.get_dummies converts a categorical variable to dummies.
# pandas is already imported as pd in the first cell, so it is not imported again here.
# NOTE(review): absolute, machine-specific path — adjust to where the CSV lives.
males = pd.read_csv("C:/Users/user/Downloads/dataset-54234.csv")
males

In [None]:
# List the variable names available in the data set.
print(males.columns)

In [None]:
# Re-read the data, keep three variables and draw 10 random rows.
# The sample is not seeded, so the rows differ from run to run.
# From here on `males` holds only wage, ethn and married.
kept_columns = ["wage", "ethn", "married"]
males = pd.read_csv("C:/Users/user/Downloads/dataset-54234.csv")[kept_columns].sample(10)
males

In [None]:
# pd.get_dummies converts a series into a data frame of dummies,
# one column per category of the series.
pd.get_dummies(males.married)

In [None]:
# Dummies for ethnicity; prefix="race" names the columns race_<category>.
race = pd.get_dummies(males["ethn"], prefix="race")
race

In [None]:
# We want to include all dummies but one: the reference category.
# It can be dropped manually, as here, or with the drop_first option of get_dummies.
race.drop(columns="race_other")

In [None]:
# Same dummies, letting get_dummies drop the first category as the reference.
pd.get_dummies(males["ethn"], prefix="race", drop_first=True)

In [None]:
# Applied to the whole data frame; drop_first=True drops one dummy level
# of each converted variable.
pd.get_dummies(males, drop_first=True)

In [None]:
# Exercise 6.3
# Converting categorical variables to dummies.
# `males` was narrowed to wage/ethn/married (10 sampled rows) in an earlier cell and
# no longer has a residence column, so the full table is read again under its own name.
males_full = pd.read_csv("C:/Users/user/Downloads/dataset-54234.csv")
# prefix_sep="" puts no separator between the prefix and the category name,
# so the category "south" becomes the column "Rsouth".
residence = pd.get_dummies(males_full.residence, prefix="R", prefix_sep="")
# Drop the "Rsouth" dummy and show 8 random rows of what is left.
residence.drop("Rsouth", axis=1).sample(8)

In [None]:
# "pd.get_dummies" can also convert numerical variables to dummies.
# Small example frame: one row per person with a numeric education code and a role.
people = [
    ("abc", 1, "doctor"),
    ("def", 3, "nurse"),
    ("ghi", 2, "doctor"),
    ("jkl", 2, "doctor"),
]
df = pd.DataFrame(people, columns=["name", "education", "role"])
df

In [None]:
# One dummy column per distinct education code.
pd.get_dummies(df["education"])

In [None]:
# columns= names the variables to convert; here the numeric education code
# and the text role are both turned into dummies.
pd.get_dummies(df, columns=["education","role"])

In [None]:
# Read the Titanic data again and list its variable names.
# NOTE(review): absolute, machine-specific path — adjust to where the CSV lives.
titanic=pd.read_csv("C:/Users/user/Downloads/titanic.csv")
print(titanic.columns)

In [None]:
# Exercise 6.4
# Keep three variables and recode Age into age groups in one chained expression.
# .assign() returns a new frame, which avoids the SettingWithCopyWarning that
# setting a column on a column subset (titanic[[...]]) can raise.
# right=False gives left-closed bins: [0, 14) -> "0-13", [14, 50) -> "14-49", [50, inf) -> "50-".
titanic = titanic[["Age", "Sex", "Pclass"]].assign(
    Age=lambda frame: pd.cut(frame.Age,
                             bins=[0, 14, 50, np.inf],
                             labels=["0-13", "14-49", "50-"],
                             right=False)
)
# Convert all three variables to dummies and show 7 random rows.
d = pd.get_dummies(titanic, columns=["Age", "Sex", "Pclass"])
d.sample(7)

Combining Data into Data Frames

In [None]:
# "pd.concat" can combine series and data frames in multiple ways.
# Three example series that all carry the default integer index 0, 1, 2.
# State and capital names use their standard spellings ("Telangana", "Hyderabad").
states = pd.Series(["Andhra Pradesh", "Telangana", "Karnataka"])
capitals = pd.Series(["Amaravathi", "Hyderabad", "Bengaluru"])
# NOTE(review): units are not stated and the figures resemble the earlier
# country example — confirm the values.
populations = pd.Series([32.7, 267.8, 15.3])

In [None]:
# Combine all three series side by side into one data frame (axis=1 means columns).
pd.concat([states, capitals, populations], axis=1)

In [None]:
# Start with a two-column frame of states and capitals.
df = pd.concat([states, capitals], axis="columns")
df

In [None]:
# A data frame and a series can be combined the same way: populations is
# added as a further column next to the columns of df.
pd.concat((df,populations), axis=1)

In [None]:
# "pd.concat" does not combine observations by row position but by index label.
# countries carries the default integer index 0, 1, 2 while capitals is indexed
# by country code, so the two series have no index labels in common.
countries = pd.Series(["MY", "ID", "KH"])
capitals = pd.Series({"MY": "Kuala Lumpur", "ID": "Jakarta", "KH": "Phnom Penh"})
pd.concat([countries, capitals], axis=1)

In [None]:
# reset_index replaces the former index with a new one: an integer sequence
# from 0 to the number of cases minus one. Here it gives capitals the index
# 0, 1, 2, the same as countries, so the rows line up.
# The alternative is the "reindex" method, which re-indexes the other series
# by the index of the first series rather than by consecutive numbers.
# NOTE(review): reset_index() without drop=True keeps the old labels as an
# extra column — confirm that is the intended display.
pd.concat((countries,capitals.reset_index()),axis=1)

In [None]:
# Exercise 6.5
# Reference category: when a categorical variable (like "residence") is converted
# into dummies, each category becomes its own binary column. To avoid redundancy
# and multicollinearity in models such as linear regression, one category is
# dropped; the dropped one is called the reference category.
males = pd.read_csv("C:/Users/user/Downloads/dataset-54234.csv")
# Build the residence dummies and remove the reference category in one chain.
residence = (
    pd.get_dummies(males["residence"], prefix="residence")
    .drop(columns="residence_north_east")
)
residence.sample(4)

In [None]:
# Chaining means linking several method calls in one expression, so that each
# method works on the result of the one before it. It performs a sequence of
# operations without creating intermediate variables.
ethn = (
    pd.get_dummies(males["ethn"], prefix="ethn")
    .drop(columns="ethn_other")
)
ethn.sample(4)

In [None]:
# males.wage is still untouched, so it is taken directly from males.
# residence and ethn were processed separately, so their dummy data frames
# are used instead of the original columns.
pieces = [males["wage"], residence, ethn]
d = pd.concat(pieces, axis=1)
d.sample(7)