In [59]:
import pandas as pd
import numpy as np

data=np.random.randn(50,3)
null_index= np.random.choice([True,False],size=(50,3),p=[0.10,0.90])
data[null_index]=np.nan
df=pd.DataFrame(data,columns=["First column","Second column","Third column"])
df

Unnamed: 0,First column,Second column,Third column
0,,,1.04924
1,-1.086478,-0.244348,-1.715656
2,0.784646,,0.730712
3,,-0.014691,0.963216
4,-1.680882,0.56488,0.025495
5,,0.491886,0.524444
6,-0.438535,-2e-06,-1.074958
7,-1.14252,1.00175,-0.907417
8,,-1.541189,1.780875
9,1.610021,,-0.440238


In [60]:
# part a: Identify and count missing values in a dataframe
df1=df.isnull().sum()
df1

First column     10
Second column    10
Third column      2
dtype: int64

In [61]:
# part b: Drop the column having more than 5 null values
thresh_val=df.shape[0] - 5
df.dropna(axis=1, thresh=thresh_val)

Unnamed: 0,Third column
0,1.04924
1,-1.715656
2,0.730712
3,0.963216
4,0.025495
5,0.524444
6,-1.074958
7,-0.907417
8,1.780875
9,-0.440238


In [62]:
# part c: the row label having maximum of the sum of all values in a row and drop that row
a=df.sum(axis=1).idxmax()
df.drop(index=a)

Unnamed: 0,First column,Second column,Third column
0,,,1.04924
1,-1.086478,-0.244348,-1.715656
2,0.784646,,0.730712
3,,-0.014691,0.963216
4,-1.680882,0.56488,0.025495
5,,0.491886,0.524444
6,-0.438535,-2e-06,-1.074958
7,-1.14252,1.00175,-0.907417
8,,-1.541189,1.780875
9,1.610021,,-0.440238


In [63]:
# part d: Sorting the dataframe on the basis of the first column
df.sort_values('First column', ignore_index=True)

Unnamed: 0,First column,Second column,Third column
0,-2.556021,0.349167,-0.533186
1,-1.803621,-2.454738,0.720025
2,-1.793473,0.80272,-1.344382
3,-1.680882,0.56488,0.025495
4,-1.551419,0.324804,-0.639828
5,-1.169954,,-0.401306
6,-1.14252,1.00175,-0.907417
7,-1.086478,-0.244348,-1.715656
8,-1.012329,0.185797,-0.587358
9,-0.957422,-0.972136,0.417549


In [64]:
# part e: Removing all duplicates from the first column
df.drop_duplicates('First column')

Unnamed: 0,First column,Second column,Third column
0,,,1.04924
1,-1.086478,-0.244348,-1.715656
2,0.784646,,0.730712
4,-1.680882,0.56488,0.025495
6,-0.438535,-2e-06,-1.074958
7,-1.14252,1.00175,-0.907417
9,1.610021,,-0.440238
11,0.198145,-0.141698,0.516393
12,1.333085,-0.430349,-0.541828
13,2.586281,-1.081418,-1.495401


In [65]:
# part f: Finding correlation between first and second column and covariance between second and third column

In [66]:
df['First column'].corr(df['Second column'])

-0.09560670752718779

In [67]:
df['Second column'].cov(df['Third column'])

-0.3842857140042494

In [68]:
# part g: Detecting outliers and removing the rows having outliers
outlier=pd.Series(data=False,index=df.index)
for col in df.columns: 
        
        Q1= df[col].quantile(0.25)
        Q3= df[col].quantile(0.75)
        IQR=Q3-Q1
    
        lower_bound = Q1-(1.5 * IQR)
        upper_bound = Q3+(1.5 * IQR)
        outlier |= (df[col] < lower_bound) | (df[col] > upper_bound)
df=df[~outlier]
df

Unnamed: 0,First column,Second column,Third column
0,,,1.04924
1,-1.086478,-0.244348,-1.715656
2,0.784646,,0.730712
3,,-0.014691,0.963216
4,-1.680882,0.56488,0.025495
5,,0.491886,0.524444
6,-0.438535,-2e-06,-1.074958
7,-1.14252,1.00175,-0.907417
8,,-1.541189,1.780875
9,1.610021,,-0.440238


In [84]:
# part h: Discretizing second column and create 5 bins
df['Binning']=pd.cut(np.array(df['Second column']),bins=5)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Binning']=pd.cut(np.array(df['Second column']),bins=5)


Unnamed: 0,First column,Second column,Third column,Second Column,Binning
0,,,1.04924,,
1,-1.086478,-0.244348,-1.715656,"(-0.644, 0.286]","(-0.644, 0.286]"
2,0.784646,,0.730712,,
3,,-0.014691,0.963216,"(-0.644, 0.286]","(-0.644, 0.286]"
4,-1.680882,0.56488,0.025495,"(0.286, 1.216]","(0.286, 1.216]"
5,,0.491886,0.524444,"(0.286, 1.216]","(0.286, 1.216]"
6,-0.438535,-2e-06,-1.074958,"(-0.644, 0.286]","(-0.644, 0.286]"
7,-1.14252,1.00175,-0.907417,"(0.286, 1.216]","(0.286, 1.216]"
8,,-1.541189,1.780875,"(-1.574, -0.644]","(-1.574, -0.644]"
9,1.610021,,-0.440238,,


In [85]:
pd.cut(np.array(df['Second column']),bins=5)

[NaN, (-0.644, 0.286], NaN, (-0.644, 0.286], (0.286, 1.216], ..., (-2.508, -1.574], (0.286, 1.216], (1.216, 2.146], NaN, (-1.574, -0.644]]
Length: 49
Categories (5, interval[float64, right]): [(-2.508, -1.574] < (-1.574, -0.644] < (-0.644, 0.286] < (0.286, 1.216] < (1.216, 2.146]]