# PANDAS

In [4]:
import pandas as pd

# Column-oriented input: every key names a column and maps to that column's values.
mydataset = {
    "cars": ["BMW", "Volvo", "Ford"],
    "passings": [3, 7, 2],
}
myvar = pd.DataFrame(data=mydataset)
print(myvar)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


In [6]:
# Print the version string of the imported pandas package.
print(pd.__version__)

2.2.2


## Pandas Series
#### A Pandas Series is like a column in a table.

In [10]:
import pandas as pd

# Build a one-dimensional labelled array from a plain Python list.
a = [1, 7, 2]
myvar = pd.Series(data=a)
print(myvar)

0    1
1    7
2    2
dtype: int64


### Labels
#### If nothing else is specified, the values are labeled with their index number. The first value has index 0, the second value has index 1, etc. This label can be used to access a specified value.

In [13]:
# Look up the value stored under the default label 0 (the first element):
import pandas as pd

a = [1, 7, 2]
myvar = pd.Series(data=a)
first_value = myvar[0]
print(first_value)

1


### Create Labels
#### With the index argument, you can name your own labels.

In [16]:
# Attach custom labels to the values instead of the default 0, 1, 2:
import pandas as pd

a = [1, 7, 2]
labels = ["x", "y", "z"]
myvar = pd.Series(data=a, index=labels)
print(myvar)

x    1
y    7
z    2
dtype: int64


#### When we have created labels, we can access an item by referring to the label.

In [21]:
# Fetch a single value through its custom label "y":
import pandas as pd

a = [1, 7, 2]
labels = ["x", "y", "z"]
myvar = pd.Series(data=a, index=labels)
y_value = myvar["y"]
print(y_value)

7


### Key/Value Objects as Series

In [27]:
# Build a Series from a dictionary; the keys become the labels:
import pandas as pd

calories = {
    "day1": 420,
    "day2": 380,
    "day3": 390,
}
myvar = pd.Series(data=calories)
print(myvar)

day1    420
day2    380
day3    390
dtype: int64


In [29]:
# Keep only the "day1" and "day2" entries of the dictionary by listing them as the index:
import pandas as pd

calories = {
    "day1": 420,
    "day2": 380,
    "day3": 390,
}
wanted_days = ["day1", "day2"]
myvar = pd.Series(data=calories, index=wanted_days)
print(myvar)

day1    420
day2    380
dtype: int64


# DataFrames
#### A Pandas DataFrame is a 2-dimensional data structure, like a 2-dimensional array, or a table with rows and columns. Data sets in Pandas are usually multi-dimensional tables, called DataFrames.
#### A DataFrame is the whole table.

In [32]:
# Create a DataFrame from a dict of two equal-length lists (one list per column):
import pandas as pd

data = {
    "calories": [420, 380, 390],
    "duration": [50, 40, 45],
}
myvar = pd.DataFrame(data=data)
print(myvar)

   calories  duration
0       420        50
1       380        40
2       390        45


### Locate Row
##### Pandas uses the loc attribute to return one or more specified row(s).

In [42]:
import pandas as pd

# Two equally long columns of sample data.
data = {
    "calories": [420, 380, 390],
    "duration": [50, 40, 45],
}
df = pd.DataFrame(data=data)
print(df)

# loc with a single row label returns that row as a Series:
first_row = df.loc[0]
print(first_row)

   calories  duration
0       420        50
1       380        40
2       390        45
calories    420
duration     50
Name: 0, dtype: int64


In [48]:
# Use a list of indexes: passing a list of row labels to loc selects
# several rows at once and yields a DataFrame.
print(df.loc[[0,1]])

   calories  duration
0       420        50
1       380        40


### Named Indexes
##### With the index argument, you can name your own indexes.

In [52]:
# Give every row a name by supplying one label per row through the index argument:
import pandas as pd

data = {
    "calories": [420, 380, 390],
    "duration": [50, 40, 45],
}
row_names = ["Day1", "Day2", "Day3"]
df = pd.DataFrame(data=data, index=row_names)
print(df)

      calories  duration
Day1       420        50
Day2       380        40
Day3       390        45


### Locate Named Indexes

In [55]:
# Refer to the named index:
print(df.loc["Day2"])  # returns the row labelled "Day2" (the label must match exactly, including case)

calories    380
duration     40
Name: Day2, dtype: int64


## Read CSV
### Load Files Into a DataFrame

In [79]:
# Load the CSV into a DataFrame.
# NOTE(review): the path below is an absolute, user-specific Windows location;
# the cell presumably fails on any machine where that file is absent — confirm
# whether a relative or configurable path is wanted.
import pandas as pd
df = pd.read_csv(r"C:\Users\user\Downloads\Dataset for Pandas.csv")
# print(df.to_string())   # to_string() renders the entire DataFrame as text instead of a truncated view

In [66]:
# Without to_string(): printing the DataFrame directly gives a truncated view
# (the recorded output elides the middle rows with "..").
import pandas as pd
df = pd.read_csv(r"C:\Users\user\Downloads\Dataset for Pandas.csv")
print(df)

     PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch              Ticket      Fare Cabin Embarked  
0      male  34.5      0      0 

### max_rows
#### You can check your system's maximum rows with the pd.options.display.max_rows statement.

In [69]:
import pandas as pd
# Show the display option that limits how many rows pandas prints before truncating.
print(pd.options.display.max_rows)

60


##### On my system the number is 60, which means that if the DataFrame contains more than 60 rows, the print(df) statement will return only the headers and the first and last 5 rows. We can change the maximum number of rows with the same statement.

In [81]:
# Increase the maximum number of rows to display the entire DataFrame:
import pandas as pd
# This assignment changes a global pandas display option, so it also affects
# how every later cell run in the same kernel prints DataFrames.
pd.options.display.max_rows = 9999
df = pd.read_csv(r"C:\Users\user\Downloads\Dataset for Pandas.csv")
# print(df)  # left disabled; with the raised limit this would print every row of the file

## Analyzing DataFrames

### Viewing the Data
#### The head() method returns the headers and a specified number of rows, starting from the top.

In [77]:
# Get a quick overview by printing the first 10 rows of the DataFrame:
import pandas as pd
df = pd.read_csv(r"C:\Users\user\Downloads\Dataset for Pandas.csv")
print(df.head(10))

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   
5          897       3                    Svensson, Mr. Johan Cervin    male   
6          898       3                          Connolly, Miss. Kate  female   
7          899       2                  Caldwell, Mr. Albert Francis    male   
8          900       3     Abrahim, Mrs. Joseph (Sophie Halaut Easu)  female   
9          901       3                       Davies, Mr. John Samuel    male   

    Age  SibSp  Parch     Ticket     Fare Cabin Embarked  
0  34.5      0      0     330911   7.8292   NaN        Q  
1

In [83]:
# Print the first 5 rows of the DataFrame (head() defaults to 5 rows when no count is given):
print(df.head())

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  


#### There is also a tail() method for viewing the last rows of the DataFrame.

In [86]:
# Print the last 5 rows of the DataFrame (tail() defaults to 5 rows when no count is given):
print(df.tail())

     PassengerId  Pclass                          Name     Sex   Age  SibSp  \
413         1305       3            Spector, Mr. Woolf    male   NaN      0   
414         1306       1  Oliva y Ocana, Dona. Fermina  female  39.0      0   
415         1307       3  Saether, Mr. Simon Sivertsen    male  38.5      0   
416         1308       3           Ware, Mr. Frederick    male   NaN      0   
417         1309       3      Peter, Master. Michael J    male   NaN      1   

     Parch              Ticket      Fare Cabin Embarked  
413      0           A.5. 3236    8.0500   NaN        S  
414      0            PC 17758  108.9000  C105        C  
415      0  SOTON/O.Q. 3101262    7.2500   NaN        S  
416      0              359309    8.0500   NaN        S  
417      1                2668   22.3583   NaN        C  


## Info About the Data

In [89]:
# Summarise the DataFrame: index range, per-column non-null counts, dtypes and memory use.
# info() prints its report itself and returns None, which is why the wrapping
# print() adds a trailing "None" line to the output.
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB
None


# Cleaning Data

## Cleaning Empty Cells

In [93]:
# Return a new DataFrame with no empty cells:
import pandas as pd
df = pd.read_csv(r"C:\Users\user\Downloads\Dataset for Pandas.csv")
# dropna() without inplace returns a filtered copy; df itself keeps all of its rows here.
new_df = df.dropna()
# info() returns None, so print() also emits a trailing "None" line.
print(new_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 87 entries, 12 to 414
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  87 non-null     int64  
 1   Pclass       87 non-null     int64  
 2   Name         87 non-null     object 
 3   Sex          87 non-null     object 
 4   Age          87 non-null     float64
 5   SibSp        87 non-null     int64  
 6   Parch        87 non-null     int64  
 7   Ticket       87 non-null     object 
 8   Fare         87 non-null     float64
 9   Cabin        87 non-null     object 
 10  Embarked     87 non-null     object 
dtypes: float64(2), int64(4), object(5)
memory usage: 8.2+ KB
None


In [95]:
# Remove all rows with NULL values without creating a new dataset:
# inplace=True mutates df itself, so every later cell that reuses df only sees
# the rows that survived (87 in the recorded output) until the CSV is reloaded.
df.dropna(inplace =  True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 87 entries, 12 to 414
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  87 non-null     int64  
 1   Pclass       87 non-null     int64  
 2   Name         87 non-null     object 
 3   Sex          87 non-null     object 
 4   Age          87 non-null     float64
 5   SibSp        87 non-null     int64  
 6   Parch        87 non-null     int64  
 7   Ticket       87 non-null     object 
 8   Fare         87 non-null     float64
 9   Cabin        87 non-null     object 
 10  Embarked     87 non-null     object 
dtypes: float64(2), int64(4), object(5)
memory usage: 8.2+ KB
None


## Replace Empty Values
##### The fillna() method allows us to replace empty cells with a value:

In [108]:
# Replace NULL values with the number 0:
# NOTE(review): df was already stripped of every row containing a NULL by the
# earlier in-place dropna(), so this call appears to have nothing left to
# replace (the recorded output is unchanged: 87 rows, all non-null) — confirm
# whether the CSV should be reloaded before this demonstration.
df.fillna(0, inplace = True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 87 entries, 12 to 414
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  87 non-null     int64  
 1   Pclass       87 non-null     int64  
 2   Name         87 non-null     object 
 3   Sex          87 non-null     object 
 4   Age          87 non-null     float64
 5   SibSp        87 non-null     int64  
 6   Parch        87 non-null     int64  
 7   Ticket       87 non-null     object 
 8   Fare         87 non-null     float64
 9   Cabin        87 non-null     object 
 10  Embarked     87 non-null     object 
dtypes: float64(2), int64(4), object(5)
memory usage: 8.2+ KB
None


### Replace Using Mean, Median, or Mode

In [117]:
# Calculate the MEAN, and replace any empty values with it:
mean = df["Age"].mean()
# The dict form restricts the fill to the "Age" column only.
# NOTE(review): df has no NULLs left after the earlier in-place dropna(), so
# this fill appears to be a no-op here (recorded output still shows 87
# non-null "Age" values) — confirm whether the data should be reloaded first.
df.fillna({"Age": mean}, inplace = True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 87 entries, 12 to 414
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  87 non-null     int64  
 1   Pclass       87 non-null     int64  
 2   Name         87 non-null     object 
 3   Sex          87 non-null     object 
 4   Age          87 non-null     float64
 5   SibSp        87 non-null     int64  
 6   Parch        87 non-null     int64  
 7   Ticket       87 non-null     object 
 8   Fare         87 non-null     float64
 9   Cabin        87 non-null     object 
 10  Embarked     87 non-null     object 
dtypes: float64(2), int64(4), object(5)
memory usage: 8.2+ KB
None


In [119]:
# Calculate the MEDIAN, and replace any empty values with it:
median = df["Age"].median()
# The dict form restricts the fill to the "Age" column only.
# NOTE(review): df has no NULLs left after the earlier in-place dropna(), so
# this fill appears to be a no-op here (recorded output still shows 87
# non-null "Age" values) — confirm whether the data should be reloaded first.
df.fillna({"Age": median}, inplace = True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 87 entries, 12 to 414
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  87 non-null     int64  
 1   Pclass       87 non-null     int64  
 2   Name         87 non-null     object 
 3   Sex          87 non-null     object 
 4   Age          87 non-null     float64
 5   SibSp        87 non-null     int64  
 6   Parch        87 non-null     int64  
 7   Ticket       87 non-null     object 
 8   Fare         87 non-null     float64
 9   Cabin        87 non-null     object 
 10  Embarked     87 non-null     object 
dtypes: float64(2), int64(4), object(5)
memory usage: 8.2+ KB
None


In [129]:
# Calculate the MODE, and replace any empty values with it:
# mode() returns a Series because several values can tie for most frequent;
# [0] takes the first of them.
x = df["Age"].mode()[0]
# NOTE(review): df has no NULLs left after the earlier in-place dropna(), so
# this fill appears to be a no-op here (recorded output still shows 87
# non-null "Age" values) — confirm whether the data should be reloaded first.
df.fillna({"Age": x}, inplace=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 87 entries, 12 to 414
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  87 non-null     int64  
 1   Pclass       87 non-null     int64  
 2   Name         87 non-null     object 
 3   Sex          87 non-null     object 
 4   Age          87 non-null     float64
 5   SibSp        87 non-null     int64  
 6   Parch        87 non-null     int64  
 7   Ticket       87 non-null     object 
 8   Fare         87 non-null     float64
 9   Cabin        87 non-null     object 
 10  Embarked     87 non-null     object 
dtypes: float64(2), int64(4), object(5)
memory usage: 8.2+ KB
None


In [131]:
# Checking null values: isnull() marks each NULL cell as True, and sum()
# then counts those marks per column.
pd.isnull(df).sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

### Discovering Duplicates

In [138]:
# Count the rows whose "PassengerId" repeats the value of an earlier row
# (by default duplicated() flags every occurrence after the first).
print(df.duplicated("PassengerId").sum())

0


### Removing Duplicates

In [142]:
# Drop rows that duplicate an earlier row, modifying df in place.
# NOTE(review): no subset is given, so rows are compared on all columns,
# whereas the duplicate count in the previous cell looked at "PassengerId"
# only — confirm this difference is intended.
df.drop_duplicates(inplace =  True)

## Statistical Tools

In [148]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,87.0,87.0,87.0,87.0,87.0,87.0
mean,1102.712644,1.137931,39.247126,0.597701,0.482759,98.109198
std,126.751901,0.435954,15.21873,0.637214,0.860801,88.177319
min,904.0,1.0,1.0,0.0,0.0,0.0
25%,986.0,1.0,27.0,0.0,0.0,35.3396
50%,1094.0,1.0,39.0,1.0,0.0,71.2833
75%,1216.0,1.0,50.0,1.0,1.0,135.06665
max,1306.0,3.0,76.0,3.0,4.0,512.3292
