## Pandas Series: Attributes, Methods, and Basic Statistics


In [None]:
import pandas as pd

In [None]:
# Ten integers with the default RangeIndex (0 .. size - 1); the name is set at construction.
series = pd.Series([4, 7, 10, 1, 9, 6, 2, 8, 5, 3], name="numbers")

# --- attributes ---
print("data type: ", series.dtype)
print("column name: ", series.name)
print(series.index)
print(series.values)
print(series.shape)  # one-dimensional: (10,)

# --- sorting and counting ---
print(series.sort_values())  # ascending order is the default
print(series.sort_index(ascending=False))  # index labels in descending order
print(series.value_counts())  # frequency of each distinct value
print(series.size)

# --- basic statistics ---
print("sum of the elements: ", series.sum())
print("maximum val: ", series.max())
print("minimum val: ", series.min())
print("mean: ", series.mean())
print("median: ", series.median())
print("standard deviation: ", series.std())
print("variance: ", series.var())

# Swap the default integer index for the letter labels "A" .. "J".
new_indices = list("ABCDEFGHIJ")
series.index = new_indices
series

## Indexing in pandas

In [None]:
# Values 1 .. 10 in ascending order, labelled "A" .. "J".
asc_series = pd.Series(
    data=list(range(1, 11)),
    index=list("ABCDEFGHIJ"),
)

In [None]:
print(asc_series.iloc[6]) #.iloc is position-based (integer) indexing: prints the element at position 6
asc_series.iloc[3 : 7] #positional slice: positions 3 to 6, the end position (7) is excluded

In [None]:
asc_series.iloc[[8, 2, 4]] #a list of positions returns those elements in the order given

In [None]:
print(asc_series.loc["G"]) #.loc is label-based indexing: prints the element whose index label is "G"
asc_series.loc["E" : "H"] #label slice: the end label ("H") is included

In [None]:
asc_series.loc[["A", "G", "I"]] #a list of labels returns those elements in the order given

In [None]:
# Demonstrates that .iloc rejects labels; the error message is printed instead of a traceback.
try:
    asc_series.iloc[["A", "G", "I"]] #"A", "G", "I" are labels, not positions
except Exception as e:
    print(e)

In [None]:
asc_series["J" : "H" : -1] #label-based slicing with step -1: walks back from "J" to "H" (the end label is included)

In [None]:
asc_series[1 : 6 : 2] #position-based slicing with step 2: positions 1, 3 and 5 (the end position is not included)

## Series using a dictionary

In [None]:
# Protein value per fruit (units are not stated in this notebook).
# Insertion order matters: it becomes the index order of the series built from this dict.
fruit_protein = dict(
    Avocado=2.0,
    Guava=2.6,
    Blackberries=2.0,
    Oranges=0.9,
    Banana=1.1,
    Apples=0.3,
    Kiwi=1.5,
    Strawberries=1.0,
    Pomegranate=1.7,
    Watermelon=0.9,
    Mango=0.8,
    Cherries=1.0,
    Pineapple=2.8,
)

In [None]:
# Dict keys become the index labels, dict values become the data.
fruit_series = pd.Series(data=fruit_protein)
fruit_series.name = "fruit_protein"
fruit_series

In [None]:
# Attribute and statistics overview of fruit_series.
# print() already ends each line with "\n", so no explicit `end` argument is needed.

# --- attributes ---
print("data type: ", fruit_series.dtype)
print("column name: ", fruit_series.name)
print(fruit_series.index)
print(fruit_series.values)
print(fruit_series.shape) #one-dimensional: one entry per fruit

# --- sorting and counting ---
print(fruit_series.sort_values(ascending = True))
print(fruit_series.sort_index(ascending = False)) #index labels (fruit names) in descending order
print(fruit_series.value_counts()) #frequency of each distinct protein value
print("Number of fruits: ", fruit_series.size)

# --- basic statistics ---
print("sum of the fruit proteins: ", fruit_series.sum())
print("maximum protein val: ", fruit_series.max())
print("minimum protein val: ", fruit_series.min())
print("mean of protein val: ", fruit_series.mean())
print("median of protein val: ", fruit_series.median()) #label typo fixed ("pf" -> "of")
print("standard deviation of protein val: ", fruit_series.std())
print("variance of protein val: ", fruit_series.var())

In [None]:
fruit_series.iloc[[0, 7, 8, 9, 12]] #selection by a list of positions

In [None]:
fruit_series.loc[["Avocado", "Strawberries", "Watermelon", "Pineapple", "Pomegranate"]] #selection by a list of labels; rows come back in the order the labels are listed

## Conditional indexing

In [None]:
# Rebuild the fruit data from scratch so this section does not depend on earlier edits.
fruit_protein = dict(
    Avocado=2.0,
    Guava=2.6,
    Blackberries=2.0,
    Oranges=0.9,
    Banana=1.1,
    Apples=0.3,
    Kiwi=1.5,
    Strawberries=1.0,
    Pomegranate=1.7,
    Watermelon=0.9,
    Mango=0.8,
    Cherries=1.0,
    Pineapple=2.8,
)
fruit_series = pd.Series(data=fruit_protein, name="fruit_protein")

In [None]:
# Each comparison yields a boolean series; combine them with | (or), & (and), ~ (not).
# The parentheses are required because |, & and ~ bind tighter than the comparisons.
fruit_series[(fruit_series < 0.5) | (fruit_series > 1)] #values below 0.5 OR above 1

In [None]:
fruit_series[(fruit_series > 0.5) & (fruit_series <= 1.5)] #values above 0.5 AND at most 1.5

In [None]:
fruit_series[~((fruit_series >= 1.0) & (fruit_series < 3))] #NOT (at least 1.0 and below 3): keeps everything outside that range

## Modifying the series

In [None]:
# Overwrite a single entry in place, addressed by its label.
fruit_series.loc["Strawberries"] = 1.8
fruit_series

In [None]:
import numpy as np

In [None]:
# Mixed-type series with two missing entries; count the entries that are present.
series = pd.Series(data=["a", np.nan, 1, np.nan, 2])
series.notna().sum()

## DataFrames

In [None]:
# Column-oriented employee records: each key is a column, each list holds one value per row.
# np.nan marks a missing Age or Salary.
identities = dict(
    Name=["Alice", "Bob", "Catherine", "David", "Eve", "Frederique", "Amanda"],
    Age=[25, 30, 38, np.nan, 29, 35, np.nan],
    Department=[
        "Customer Support",
        "Marketing",
        "Human Resource",
        "Sales",
        "Administration",
        "IT",
        "Human Resource",
    ],
    Salary=[50000, 60000, 70000, 62000, np.nan, 85000, np.nan],
)

In [None]:
# Build the frame from the column dict; columns keep the dict order (Name, Age, Department, Salary).
data_frame = pd.DataFrame(identities)
data_frame

In [None]:
data_frame.head(3) #first 3 rows

In [None]:
data_frame.tail(2) #last 2 rows

In [None]:
data_frame.iloc[ : 4, : 2] #first 4 rows and first 2 columns (end positions are excluded)

In [None]:
data_frame.iloc[ : , : ] #all rows and all columns

In [None]:
data_frame.iloc[2 : 5, [0, 3]] #rows at positions 2 to 4, and the columns at positions 0 and 3

In [None]:
data_frame.iloc[[0, 2, 4], [0, 3]] #rows at positions 0, 2 and 4, and the columns at positions 0 and 3

In [None]:
data_frame.iloc[[0, 4], 1 : 3] #rows at positions 0 and 4, and the columns at positions 1 and 2

In [None]:
data_frame.loc[2 : 4, ["Name", "Department", "Salary"]] #label-based: row labels 2 to 4 (end label included) and the named columns

In [None]:
data_frame.loc[[1, 3, 5], ["Name", "Department", "Salary"]] #row labels 1, 3 and 5 and the named columns

In [None]:
data_frame[["Department", "Salary"]] #a list of column names selects those columns as a new frame

In [None]:
data_frame.shape #(number of rows, number of columns)

In [None]:
data_frame.info() #prints column names, non-null counts and dtypes

In [None]:
data_frame.describe() #summary statistics of the numeric columns

In [None]:
data_frame.drop([3, 4], axis = 0, inplace = False) #axis = 0 drops rows by label; inplace is False by default, so the original dataframe is left unchanged and a new one is returned

In [None]:
data_frame.drop(["Age", "Salary"], axis = 1, inplace = False) #axis = 1 drops columns by name; with inplace = True the rows or columns would be removed from the original dataframe itself

## Broadcasting

In [None]:
# Same employee records as before, but with full names ("Forename Surname").
# np.nan marks a missing Age or Salary.
identities = dict(
    Name=[
        "Alice Nicole",
        "Bob Dylan",
        "Catherine Jefferson",
        "David Ronalds",
        "Eve Maxwell",
        "Frederique Robinson",
        "Amanda Daffnie",
    ],
    Age=[25, 30, 38, np.nan, 29, 35, np.nan],
    Department=[
        "Customer Support",
        "Marketing",
        "Human Resource",
        "Sales",
        "Administration",
        "IT",
        "Human Resource",
    ],
    Salary=[50000, 60000, 70000, 62000, np.nan, 85000, np.nan],
)

In [None]:
# Working copy for the remaining sections, built from the full-name records.
data_frame_modified = pd.DataFrame(identities)

In [None]:
# Broadcasting: the scalar 5000 is added to every element of the column (missing values stay NaN).
# This cell reads and overwrites the same column, so re-running it adds another 5000 each time.
data_frame_modified["Salary"] = data_frame_modified["Salary"] + 5000
data_frame_modified["Salary"]

## Column operations

In [None]:
# Rename the "Department" column to "Dept"; inplace = True modifies data_frame_modified itself.
data_frame_modified.rename(columns = {"Department" : "Dept"}, inplace = True)
data_frame_modified

In [None]:
data_frame_modified["Dept"].unique() #distinct department names

In [None]:
data_frame_modified["Dept"].value_counts() #number of rows per department

In [None]:
# Derive a new column from an existing one.
# NOTE(review): the factor is 25 (25x the salary), not 1.25 (a 25% raise); a later cell compares this column against 1800000, which fits the 25x scale - confirm the intent.
data_frame_modified["Salary after Promotion"] = data_frame_modified["Salary"] * 25
data_frame_modified

## Data cleaning

In [None]:
# None of the calls in these cells is assigned back, so data_frame_modified itself is left unchanged; each cell only displays the result.
data_frame_modified.dropna() #removes every row that contains at least one null value

In [None]:
data_frame_modified.dropna(axis = 0) #axis = 0 (rows) is the default, so this gives the same result as dropna()

In [None]:
data_frame_modified.dropna(how = "any") #how = "any" is the default: a row is removed if it contains at least one null value

In [None]:
data_frame_modified.dropna(how = "all") #a row is removed only if all of its elements are null; otherwise the row is kept

In [None]:
data_frame_modified.dropna(axis = 1) #removes every column that contains at least one null value

In [None]:
data_frame_modified.dropna(how = "any", axis = 1) #a column is removed if it contains at least one null value

In [None]:
data_frame_modified.dropna(how = "all", axis = 1) #a column is removed only if all of its values are null

In [None]:
data_frame_modified.fillna(0) #fills all the NaN values with 0

In [None]:
data_frame_modified["Age"].fillna(data_frame_modified["Age"].mean()) #fills the missing values in the Age column with the mean of the Age column

In [None]:
data_frame_modified["Salary"].fillna(data_frame_modified["Salary"].median()) #fills the missing values in the Salary column with the median of the Salary column

In [None]:
# Forward fill: walking from top to bottom, each missing value is replaced by the
# last valid value above it. A missing value in the first cell stays NaN because
# there is nothing above it to copy from.
# .ffill() replaces fillna(method = "ffill"), which is deprecated in recent pandas.
data_frame_modified["Salary"].ffill()

In [None]:
# Backward fill: each missing value is replaced by the next valid value below it.
# A missing value in the last cell stays NaN because there is nothing below it to
# copy from (the Salary data built earlier ends with a NaN, so that one remains).
# .bfill() replaces fillna(method = "bfill"), which is deprecated in recent pandas.
data_frame_modified["Salary"].bfill()

In [None]:
# Series.replace("Amanda", ...) only matches cells whose whole value is "Amanda".
# The Name column holds full names ("Amanda Daffnie"), so that call changed nothing.
# .str.replace substitutes inside each string; regex = False treats "Amanda" as plain text.
data_frame_modified["Name"] = data_frame_modified["Name"].str.replace("Amanda", "Minerva", regex = False)
data_frame_modified

## Dealing with duplicates

In [None]:
# duplicated() returns a boolean mask; indexing the frame with it keeps only the rows flagged as duplicates.
data_frame_duplicates = data_frame_modified[data_frame_modified.duplicated()]
data_frame_duplicates #whole rows are compared here; the current data_frame_modified doesn't contain any identical rows

In [None]:
duplicated_dept = data_frame_modified[data_frame_modified["Dept"].duplicated()]
duplicated_dept #only the Dept column is compared: a value that was already seen earlier is flagged as the duplicate

In [None]:
duplicated_dept = data_frame_modified[data_frame_modified["Dept"].duplicated(keep = "first")]
duplicated_dept #keep = "first" is the default: scanning from top to bottom, the first occurrence is kept and every later repeat is flagged as a duplicate

In [None]:
# keep = "last": scanning from bottom to top, the last occurrence of each Dept is
# kept and every earlier repeat is flagged as a duplicate.
# The mask is applied to data_frame_modified, the frame it was computed from
# (previously it was applied to the unrelated earlier frame `data_frame`).
duplicated_dept = data_frame_modified[data_frame_modified["Dept"].duplicated(keep = "last")]
duplicated_dept

In [None]:
# Drop every row whose Dept value already appeared in an earlier row;
# keep = "first" (the default, spelled out here) retains the first occurrence.
removed_duplicates = data_frame_modified.drop_duplicates(subset = ["Dept"], keep = "first")
removed_duplicates

## Lambda functions

In [None]:
# Element-wise rescale: amounts above 1800000 are multiplied by 1.5, all others by 2.5.
# NaN compares as False, so missing amounts take the 2.5 branch and stay NaN.
data_frame_modified["Salary after Promotion"] = data_frame_modified["Salary after Promotion"].apply(
    lambda amount : amount * (1.5 if amount > 1800000 else 2.5)
)
data_frame_modified

In [None]:
# Element-wise rescale: salaries above 70000 are divided by 3, all others by 2.
# NaN compares as False, so missing salaries take the "/ 2" branch and stay NaN.
data_frame_modified["Salary"] = data_frame_modified["Salary"].apply(
    lambda pay : pay / (3 if pay > 70000 else 2)
)
data_frame_modified

In [None]:
# Split "Forename Surname" into two columns, then drop the combined column.
# n = 1 splits on the first space only, so the result never has more than two
# columns; without it, a name with three or more words would produce extra
# columns and the two-column assignment would fail.
# This cell removes "Name", so it can only be run once per fresh data_frame_modified.
data_frame_modified[["Forename", "Surname"]] = data_frame_modified["Name"].str.split(" ", n = 1, expand = True)
data_frame_modified = data_frame_modified.drop(["Name"], axis = 1)
data_frame_modified

In [None]:
# Put the name columns first; reindex along the column axis returns a new frame in this order.
data_frame_modified = data_frame_modified.reindex(
    ["Forename", "Surname", "Age", "Dept", "Salary", "Salary after Promotion"],
    axis = "columns",
)
data_frame_modified

## Function calls vs. lambda

In [None]:
def multiplying_age(x):
    """Return the given value multiplied by two."""
    doubled = x * 2
    return doubled
# apply() with a named function: multiplying_age is called once per element of the Age column.
data_frame_modified["Age"] = data_frame_modified["Age"].apply(multiplying_age)
data_frame_modified

In [None]:
# Same pattern with an anonymous function: every age is divided by 2.
data_frame_modified["Age"] = data_frame_modified["Age"].apply(
    lambda age : age / 2
)
data_frame_modified

## Joins and merges

In [None]:
# Lookup table with one row per department entry: where it sits and who manages it.
# "Human Resource" appears twice, with different locations and managers.
department_info = dict(
    Dept=[
        "Customer Support",
        "Marketing",
        "Human Resource",
        "Sales",
        "Administration",
        "IT",
        "Human Resource",
    ],
    Location=["Zurich", "Frankfurt", "Linz", "Gothenburg", "Naples", "Vienna", "Edinburgh"],
    Manager=[
        "Michael Jackson",
        "Billy Eilish",
        "Lana Del Rey",
        "Gracie Abrams",
        "Zara Larsson",
        "John Lennon",
        "Sabrina Carpenter",
    ],
)
dept_info = pd.DataFrame(data=department_info)
dept_info

In [None]:
pd.concat([data_frame_modified, dept_info], axis = 1) #places the two data frames side by side, aligned on the row index; the shared "Dept" column therefore appears twice

In [None]:
# merge() matches rows on the values of "Dept" and keeps a single "Dept" column in the result.
# "Human Resource" occurs twice in dept_info, so every row with that Dept is matched to both entries.
pd.merge(data_frame_modified, dept_info, on = "Dept", how = "inner") #inner: only Dept values present in both frames

In [None]:
pd.merge(data_frame_modified, dept_info, on = "Dept", how = "left") #left: keeps every row of data_frame_modified

In [None]:
pd.merge(data_frame_modified, dept_info, on = "Dept", how = "right") #right: keeps every row of dept_info

In [None]:
pd.merge(data_frame_modified, dept_info, on = "Dept", how = "outer") #outer: keeps the rows of both frames

## Importing files

In [None]:
# Load the marks workbook; the path is relative to the notebook's working directory.
data_in_excel_sheet = pd.read_excel("CSE 4303 Mid Marks.xlsx")
# Only the last expression of a cell is displayed automatically, so a bare
# `.shape` in the middle of the cell is silently discarded; print it explicitly.
print(data_in_excel_sheet.shape)
data_in_excel_sheet.info() #prints its report itself
data_in_excel_sheet.describe() #last expression: displayed as the cell output

In [None]:
# Convert the "Student ID" column to 32-bit integers and confirm the new dtype via info().
# NOTE(review): assumes the column has no missing values and that every ID fits in int32 - verify against the workbook.
data_in_excel_sheet["Student ID"] = data_in_excel_sheet["Student ID"].astype("int32")
data_in_excel_sheet.info()