In [None]:
import pandas as pd
# Read the training CSV from the Colab working directory into a DataFrame.
train_csv_path = "/content/train.csv"
df = pd.read_csv(train_csv_path)

# Print the (truncated) frame, then let the cell display its (rows, columns) tuple.
print(df)
df.shape

        Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0        1          60       RL         65.0     8450   Pave   NaN      Reg   
1        2          20       RL         80.0     9600   Pave   NaN      Reg   
2        3          60       RL         68.0    11250   Pave   NaN      IR1   
3        4          70       RL         60.0     9550   Pave   NaN      IR1   
4        5          60       RL         84.0    14260   Pave   NaN      IR1   
...    ...         ...      ...          ...      ...    ...   ...      ...   
1455  1456          60       RL         62.0     7917   Pave   NaN      Reg   
1456  1457          20       RL         85.0    13175   Pave   NaN      Reg   
1457  1458          70       RL         66.0     9042   Pave   NaN      Reg   
1458  1459          20       RL         68.0     9717   Pave   NaN      Reg   
1459  1460          20       RL         75.0     9937   Pave   NaN      Reg   

     LandContour Utilities  ... PoolArea PoolQC  Fe

(1460, 81)

In [None]:
df.dropna() # default: returns a new DataFrame without the rows that contain at least one null (NaN); df itself is unchanged
# df.dropna(axis=1) -> drops the columns that contain a null instead of the rows
# df.dropna(inplace=True, axis=1) -> same column-wise drop, but modifies df itself instead of returning a new DataFrame

In [None]:
import numpy as np
# Small demo frame: Name, Age and Gender each contain exactly one missing
# value (None or np.nan), while Salary is complete.
data = dict(
    Name=['John', 'Alice', 'Bob', 'Charlie', None],
    Age=[28, np.nan, 22, 35, 40],
    Gender=['Male', 'Female', 'Male', np.nan, 'Female'],
    Salary=[50000, 62000, 30000, 58000, 52000],
)

df = pd.DataFrame(data)
print(df)

      Name   Age  Gender  Salary
0     John  28.0    Male   50000
1    Alice   NaN  Female   62000
2      Bob  22.0    Male   30000
3  Charlie  35.0     NaN   58000
4     None  40.0  Female   52000


In [None]:
# Keep only the rows that have no missing value in any column.
# The result is a new DataFrame; df is not modified.
df.dropna()

Unnamed: 0,Name,Age,Gender,Salary
0,John,28.0,Male,50000
2,Bob,22.0,Male,30000


In [None]:
# axis=1 makes dropna work on columns: every column that contains at least
# one null is dropped. The commented line below is the same call using the
# axis name instead of its number.
df.dropna(axis=1) #or
#df.dropna(axis='columns')

Unnamed: 0,Salary
0,50000
1,62000
2,30000
3,58000
4,52000


In [None]:
# how='all' drops a row only when EVERY value in that row is null.
# No row of df is entirely null, so nothing is removed here.
# (The default, how='any', drops rows that contain at least one null.)
df_new = df.dropna(how='all')
print(df_new)

      Name   Age  Gender  Salary
0     John  28.0    Male   50000
1    Alice   NaN  Female   62000
2      Bob  22.0    Male   30000
3  Charlie  35.0     NaN   58000
4     None  40.0  Female   52000


### **Drop rows with missing (None/NaN) values based on specific columns**

In [None]:
# col_1 has two missing entries (None is shown as NaN in a numeric column);
# col_2 is complete.
data = dict(
    col_1=[10, None, 30, None, 50],
    col_2=[100, 200, 300, 400, 500],
)

df = pd.DataFrame(data)
print(df)

   col_1  col_2
0   10.0    100
1    NaN    200
2   30.0    300
3    NaN    400
4   50.0    500


In [None]:
# Step 1: build a boolean mask over col_1 -- True where the cell holds a
#         value, False where it is NaN.
# Step 2: use the mask to select rows; only rows marked True are kept.
# df is not modified: the selected rows are returned as a new DataFrame.
has_col_1 = df['col_1'].notna()
df_new = df.loc[has_col_1]
print(df_new)

   col_1  col_2
0   10.0    100
2   30.0    300
4   50.0    500


In [None]:
# Drop the rows whose col_1 is null. Because `subset` names a single column,
# how='all' and how='any' select the same rows here.
# inplace=True modifies df itself rather than returning a new DataFrame.
df.dropna(subset=['col_1'], how='all', inplace=True)
print(df)

   col_1  col_2
0   10.0    100
2   30.0    300
4   50.0    500


### **Drop duplicates with multiple columns**

In [None]:
# Demo frame in which the Alice and Bob rows each appear twice.
data = dict(
    Name=['Alice', 'Bob', 'Alice', 'Charlie', 'Bob', 'David', 'Eve'],
    Age=[28, 34, 28, 25, 34, 40, 30],
    Job=['Engineer', 'Doctor', 'Engineer', 'Artist', 'Doctor', 'Manager', 'Nurse'],
    Location=['New York', 'Los Angeles', 'New York', 'Chicago', 'Los Angeles', 'San Francisco', 'Boston'],
)

df = pd.DataFrame(data)
print(df)

      Name  Age       Job       Location
0    Alice   28  Engineer       New York
1      Bob   34    Doctor    Los Angeles
2    Alice   28  Engineer       New York
3  Charlie   25    Artist        Chicago
4      Bob   34    Doctor    Los Angeles
5    David   40   Manager  San Francisco
6      Eve   30     Nurse         Boston


In [None]:
# Two rows count as duplicates when they match on every key column below.
#   keep='last'  -> keep the last occurrence of each duplicate group
#   keep='first' -> keep the first occurrence of each duplicate group
#   keep=False   -> drop every row that has a duplicate
key_columns = ['Name', 'Age']
df_duplicate = df.drop_duplicates(subset=key_columns, keep='last')
print(df_duplicate)

      Name  Age       Job       Location
2    Alice   28  Engineer       New York
3  Charlie   25    Artist        Chicago
4      Bob   34    Doctor    Los Angeles
5    David   40   Manager  San Francisco
6      Eve   30     Nurse         Boston


### **Iterating in Dataframe**

*   `df.iterrows()` is used to iterate over the rows of a DataFrame
*   Each iteration returns a pair: the row's index label and the row itself (as a Series)



In [None]:
# iterrows() yields (index label, row) pairs; each row is indexed by column name.
for index, row in df.iterrows():
    cur_name, cur_age = row['Name'], row['Age']
    print(f"Cur_name:{cur_name},\t\tCur_age:{cur_age}")


Cur_name:Alice,		Cur_age:28
Cur_name:Bob,		Cur_age:34
Cur_name:Alice,		Cur_age:28
Cur_name:Charlie,		Cur_age:25
Cur_name:Bob,		Cur_age:34
Cur_name:David,		Cur_age:40
Cur_name:Eve,		Cur_age:30


### **Converting pandas dataframe into dictionary**

In [None]:
# Pair each Name with the Age from the same row. Dictionary keys are unique,
# so a name that occurs in several rows ends up with the age from its last row.
dict_stats = {name: age for name, age in zip(df['Name'], df['Age'])}
print(dict_stats)

{'Alice': 28, 'Bob': 34, 'Charlie': 25, 'David': 40, 'Eve': 30}


### **Converting list of list into dataframe**

In [None]:
# Each inner list is one row; the column names are supplied separately.
lst_pets = [
    ['dog', 1],
    ['cat', 2],
    ['fish', 3],
]
pet_columns = ['pets', 'amount']
df_pets = pd.DataFrame(data=lst_pets, columns=pet_columns)
print(df_pets)

   pets  amount
0   dog       1
1   cat       2
2  fish       3


**Renaming column names**

*   dataframe_var.rename(columns={'old_column_name':'new_column_name'....},inplace=True)




In [None]:
# Rename pets -> pet_type and amount -> amnt.
# inplace=True renames the columns on df_pets itself instead of returning a renamed copy.
df_pets.rename(columns={'pets':'pet_type','amount':'amnt'},inplace=True)
print(df_pets)

  pet_type  amnt
0      dog     1
1      cat     2
2     fish     3


In [None]:
def to_minutes(row):
    """Return twice the ``amnt`` value of one DataFrame row.

    Args:
        row: A single row of ``df_pets``, indexable by column name.

    Returns:
        ``row["amnt"] * 2``.
    """
    return row["amnt"] * 2


# Row-wise approach: apply() with axis=1 calls to_minutes once per row.
# This line must actually run -- later cells display the 'newcolumn' column,
# so leaving it commented out makes the notebook fail to reproduce its own
# output after a kernel restart.
df_pets['newcolumn'] = df_pets.apply(to_minutes, axis=1)

# Column-wise approach (used most often): operate on the whole column at once
# with dataframe_var['column_name'].
df_pets['newcolumn2'] = df_pets['amnt'] * 3
print(df_pets)

  pet_type  amnt  newcolumn  newcolumn2
0      dog     1          2           3
1      cat     2          4           6
2     fish     3          6           9


## **Multiple if condition**

In [None]:
# A comparison on a column gives one True/False per row; .any() reduces that
# to a single bool ("does at least one row match?") that `and` / `or` can
# combine. The checks are independent: they need not match on the same row.
#
# Always compare first and reduce second. Writing
# `(df_pets['amnt']).any() == 5` would compare a bool with 5, which is
# always False.
has_amount_one = (df_pets['amnt'] == 1).any()
has_amount_five = (df_pets['amnt'] == 5).any()

if has_amount_one and (df_pets['pet_type'] == "dog").any():  # conditional "and"
    print("10")
elif has_amount_one or has_amount_five:  # conditional "or"
    print("either 1 or 5")
else:
    print("else")

10


In [None]:
# Keep only the rows whose pet_type is one of the wanted values.
wanted_types = ["dog", "cat"]
is_wanted = df_pets["pet_type"].isin(wanted_types)
df_dog_cat = df_pets[is_wanted]
print(df_dog_cat)

  pet_type  amnt  newcolumn  newcolumn2
0      dog     1          2           3
1      cat     2          4           6


### **Return index of first occurrence of a substring**

In [None]:
# Sample sentence for the substring searches below; "wonderful" occurs twice.
str_message = "Life will be truly wonderful on a wonderful planet."
print(str_message)

Life will be truly wonderful on a wonderful planet.


In [None]:
# str.find searches from the left and returns the start index of the first
# match, or -1 when the substring does not occur.
print(str_message.find('wonderful'))

19


In [None]:
# str.rfind returns the start index of the last (right-most) match.
# The index is still counted from the left end of the string.
print(str_message.rfind('wonderful'))

34


In [None]:
# The second argument is the position where the search starts: look for
# "wonder" from index 20 onwards, which skips the first match at index 19.
print(str_message.find('wonder', 20))

34



### **Random number**

In [None]:
import random
# random.random() draws a float from the half-open range [0.0, 1.0):
# 0.0 can be returned, 1.0 cannot.
random_num =random.random()
print(random_num)

0.48854240210769595


**Random number generation between a range**

In [None]:
import random
lower, upper = 1, 10

# uniform() returns a float somewhere between the two bounds.
rand_range_float = random.uniform(lower, upper)
print("Random float number : ", rand_range_float)

# randint() returns a whole number and includes both the start and the end point.
rand_range_int = random.randint(lower, upper)
print("Random int number : ", rand_range_int)

Random float number :  5.796454314449198
Random int number :  9


In [None]:
from random import randrange
# With one argument, randrange treats it as the exclusive upper bound: 0..9.
print(randrange(10))
# start, stop, step: the step size restricts the draw to 0, 2, 4, 6 or 8.
print(randrange(0,10,2))

8
2


# **Concatenating a bunch of strings**

In [None]:
import pandas as pd
# One record per author; a list of dicts becomes one DataFrame row per dict.
author_record = {
    "name": "John",
    "email": "John@gmail.com",
    "affiliation": "University A",
    "coauthor_names": "Jane Smith",
    "research": "AI",
}
data = [author_record]
df = pd.DataFrame(data)
print(df)

   name           email   affiliation coauthor_names research
0  John  John@gmail.com  University A     Jane Smith       AI


In [None]:
# Pull the fields out of the first record in a fixed order, then glue them
# together with commas into a single line.
record = data[0]
field_order = ('name', 'email', 'affiliation', 'coauthor_names', 'research')
name, email, affiliation, coauthors_names, research = (record[key] for key in field_order)
curr_line = ",".join([name, email, affiliation, coauthors_names, research])
print(curr_line)

John,John@gmail.com,University A,Jane Smith,AI


# **Extracting data using iloc and loc**

In [None]:
# iloc selects by integer position, written as [rows, columns].
# ':' keeps every row; ':2' keeps the columns at positions 0 and 1
# (the stop position is exclusive).
arr_num = df.iloc[:, :2]
print(arr_num)

   name           email
0  John  John@gmail.com


In [None]:
# loc selects by label: every row, and only the columns named below.
wanted_columns = ['name', 'email']
arr_loc = df.loc[:, wanted_columns]
print(arr_loc)

   name           email
0  John  John@gmail.com


## **Exception**

*   The `traceback` module allows you to get detailed information about where an error happened, including the file name, line number, and the code that was executing at the time of the error




In [None]:
import traceback

path_file = "/content/dummy.json"

# Bind f before the try block: if open() fails, f would otherwise not exist
# and the cleanup in `finally` would raise a NameError of its own.
f = None
try:
    f = open(path_file, 'r')
except OSError:
    # Catch only file-related errors (missing file, no permission, ...) and
    # show the full traceback: file name, line number and failing statement.
    traceback.print_exc()
else:
    # Runs only when open() succeeded, so the message is never printed
    # after a failure.
    print("Successfull")
finally:
    # Always runs; close the file only if it was actually opened.
    if f is not None:
        f.close()

Successfull
