### Using Template Data

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

boat = sns.load_dataset('titanic')
boat.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [6]:
boat.info()

boat.describe()

boat.columns

boat.to_csv('./dataset/titanic.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


### Using Own Data

In [11]:
# Method - 1

df = pd.read_excel("./dataset/Startup_data.xlsx")
df.head()
df.tail()
df.describe()
df.columns

Index(['Startup name', 'Active (opinion by CMs)', 'Last edited by', 'Location',
       'HQ', 'Founded date', 'Industry (Vertical)', 'Horizontal',
       'Operating status', 'Funding status', 'Employees', 'Founder 1',
       'Founder 2', 'Founder 3', 'Founder 4', 'Founder 5', 'Unnamed: 16',
       'Website', 'Social media URLs', 'Email', 'Phone',
       'Funding round 1 (Investment)', 'Founder round 1 investors',
       'Funding round 1 date', 'Funding round 2 (Investment)',
       'Founder round 2 investors', 'Funding round 2 date',
       'Funding round 3 (Investment)', 'Founder round 3 investors',
       'Funding round 3 date', 'Funding round 4 (Investment)',
       'Founder round 4 investors', 'Funding round 4 date',
       'Funding round 5 (Investment)', 'Founder round 5 investors',
       'Funding round 5 date'],
      dtype='object')

In [13]:
# Method - 2

df = sns.load_dataset('titanic')

# Reversing Row Order

df.loc[::-1].head()
df.loc[::-1].reset_index(drop=True).head() # Now las row has index 0

# Reversing Column Order

df.loc[:, ::-1].head()

df.dtypes # Shows data types present in dataset

# Selecting Columns Based on Data types

df.select_dtypes(include=['object']).head() # Include specific datatype
df.select_dtypes(exclude=['object']).head() # Exclude specific datatype

# Converting String into numeric

df1 = pd.DataFrame({'col_1':["1","2","3"],'col_2':["4","5","6"]})

# Method - 1

df1.astype({'col_1':'int','col_2':'int'}).dtypes

# Method - 2

pd.to_numeric(df1['col_1'],errors='coerce')

0    1
1    2
2    3
Name: col_1, dtype: int64

### Taking Sample form large Dataset

In [18]:
df2 = sns.load_dataset('titanic')
# df2.info()

df2.sample(frac=0.1).info()

<class 'pandas.core.frame.DataFrame'>
Index: 89 entries, 162 to 544
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     89 non-null     int64   
 1   pclass       89 non-null     int64   
 2   sex          89 non-null     object  
 3   age          70 non-null     float64 
 4   sibsp        89 non-null     int64   
 5   parch        89 non-null     int64   
 6   fare         89 non-null     float64 
 7   embarked     88 non-null     object  
 8   class        89 non-null     category
 9   who          89 non-null     object  
 10  adult_male   89 non-null     bool    
 11  deck         24 non-null     category
 12  embark_town  88 non-null     object  
 13  alive        89 non-null     object  
 14  alone        89 non-null     bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 9.2+ KB


### Splitting Dataset

In [45]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [24]:
len(df)
df.shape

(891, 15)

In [26]:
df1 = df.sample(frac=0.1, random_state=1)
df1.shape

(89, 15)

In [29]:
df2 = df.drop(df1.index)
df2.shape

(802, 15)

In [31]:
df1.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
862,1,1,female,48.0,0,0,25.9292,S,First,woman,False,D,Southampton,yes,True
223,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True
84,1,2,female,17.0,0,0,10.5,S,Second,woman,False,,Southampton,yes,True
680,0,3,female,,0,0,8.1375,Q,Third,woman,False,,Queenstown,no,True
535,1,2,female,7.0,0,2,26.25,S,Second,child,False,,Southampton,yes,False


In [32]:
df2.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


### Rejoining the Dataset

In [35]:
df3 = pd.concat([df1,df2])

### Filtering Dataset

In [46]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [49]:
df.sex.unique() # Shows Unique Data
df[(df.sex=="female")] # SHowing only female's data
df[((df.embark_town=="Southampton") | (df.embark_town=="Cherbourg")) & (df.sex=="female")]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
882,0,3,female,22.0,0,0,10.5167,S,Third,woman,False,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [50]:
df[df.embark_town.isin(['Southampton', 'Queenstown'])].head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True


### Filtering by large categories

In [51]:
df.age.value_counts().nlargest(3) # Shows top 3 largest values

age
24.0    30
22.0    27
18.0    26
Name: count, dtype: int64

### Splitting a string into multiple columns

In [53]:
import pandas as pd

df = pd.DataFrame({'Name' : ['Micheal John', 'Aurelia Nobels', 'Scarlett Johnson', 'Elizebeth Olsan'],
                   'Location' : ['NewYork, US', 'London, UK', 'Paris, France', 'Berlin, Germany']})
df

Unnamed: 0,Name,Location
0,Micheal John,"NewYork, US"
1,Aurelia Nobels,"London, UK"
2,Scarlett Johnson,"Paris, France"
3,Elizebeth Olsan,"Berlin, Germany"


In [55]:
df[['First_Name','Last_Name']] = df.Name.str.split(' ', expand=True)
df[['City', 'Country']] = df.Location.str.split(',',expand=True)
df

Unnamed: 0,Name,Location,First_Name,Last_Name,City,Country
0,Micheal John,"NewYork, US",Micheal,John,NewYork,US
1,Aurelia Nobels,"London, UK",Aurelia,Nobels,London,UK
2,Scarlett Johnson,"Paris, France",Scarlett,Johnson,Paris,France
3,Elizebeth Olsan,"Berlin, Germany",Elizebeth,Olsan,Berlin,Germany


In [56]:
df = df[['First_Name','Last_Name','City','Country']]  # Refining the data
df 

Unnamed: 0,First_Name,Last_Name,City,Country
0,Micheal,John,NewYork,US
1,Aurelia,Nobels,London,UK
2,Scarlett,Johnson,Paris,France
3,Elizebeth,Olsan,Berlin,Germany
