In [121]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv('iris.csv')

In [3]:
# creating missing values
df['petal.length'][0] = None
df['petal.length'][5] = None
df['petal.length'][2] = None
df['petal.length'][3] = None
df['petal.length'][10] = None
df['petal.length'][20] = None
df['petal.length'][15] = None

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['petal.length'][0] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['petal.length'][0] = None
You a

In [4]:
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,,0.2,Setosa
3,4.6,3.1,,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [5]:
total_null = sum(df['petal.length'].isnull())
print(total_null)
total_length = df.shape[0]
missing_percentage = round((total_null/total_length)*100)
print("% of null in petal length: ",missing_percentage,"%")

7
% of null in petal length:  5 %


In [6]:
#check if < 10
mean = round(df['petal.length'].mean(),1)
if missing_percentage < 10:
    df['petal.length'] = df['petal.length'].fillna(mean)

In [7]:
df['variety'][0] = None # making the starting variable None
mode = df['variety'].mode()[0]
df['variety'] = df['variety'].fillna(mode) # filling it with mode value
df

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['variety'][0] = None # making the starting variable None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,3.9,0.2,Versicolor
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,3.9,0.2,Setosa
3,4.6,3.1,3.9,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [8]:
df = pd.read_csv('iris.csv') # getting things back to normal
val_df = df.drop("variety",axis=1)

In [9]:
val_df['sepal.length'][0] = None
val_df['sepal.length'][4] = None

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  val_df['sepal.length'][0] = None
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series

In [10]:
val_df # Do not keep any catagorical data when u do this

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [11]:
imputer = KNNImputer(n_neighbors=3)
imputed_data = imputer.fit_transform(val_df) # create the imputer and fit the data frame

In [12]:
new_df = pd.DataFrame(imputed_data) # u can view the iputed data directly but for better understanding its better to change it also to dataframe
new_df

Unnamed: 0,0,1,2,3
0,5.300000,3.5,1.4,0.2
1,4.900000,3.0,1.4,0.2
2,4.700000,3.2,1.3,0.2
3,4.600000,3.1,1.5,0.2
4,5.233333,3.6,1.4,0.2
...,...,...,...,...
145,6.700000,3.0,5.2,2.3
146,6.300000,2.5,5.0,1.9
147,6.500000,3.0,5.2,2.0
148,6.200000,3.4,5.4,2.3


In [14]:
total_length = df.shape[0]
cols = list(df.columns)
for x in cols:
    total_null = sum(df[x].isnull())
    missing_percentage = round((total_null/total_length)*100)
    print(f"% of null in {x}: ",missing_percentage,"%")
    if missing_percentage > 10:
        df.drop(x,axis=1)

% of null in sepal.length:  0 %
% of null in sepal.width:  0 %
% of null in petal.length:  0 %
% of null in petal.width:  0 %
% of null in variety:  0 %


In [76]:
df = pd.read_csv("iris.csv")

In [77]:
df['sepal.length'][0] = 100 # creating outliers
df['sepal.length'][3] = 100
df['sepal.width'][4] = 100
df

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['sepal.length'][0] = 100 # creating outliers
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sepal.lengt

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,100.0,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,100.0,3.1,1.5,0.2,Setosa
4,5.0,100.0,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [78]:
numerical_col = []
for x in cols:
    if df[x].dtype !="object":
        numerical_col.append(x)
numerical_col

['sepal.length', 'sepal.width', 'petal.length', 'petal.width']

In [79]:
def z_score_outliers(data,threshold):
    indexes = []
    mean = np.mean(data)
    std = np.std(data)
    for index, value in enumerate(data):
        z = (value-mean)/std
        if abs(z) > threshold:
            indexes.append(index)
    print(indexes)
    return indexes

def drop_indexes(arr,df):
    for x in arr:
        df.drop(x,inplace=True)
    print(df)
    df.reset_index(drop=True,inplace=True)
    print(df)

for x in numerical_col:
    indexes = z_score_outliers(df[x],3)
    drop_indexes(indexes,df)

[0, 3]
     sepal.length  sepal.width  petal.length  petal.width    variety
1             4.9          3.0           1.4          0.2     Setosa
2             4.7          3.2           1.3          0.2     Setosa
4             5.0        100.0           1.4          0.2     Setosa
5             5.4          3.9           1.7          0.4     Setosa
6             4.6          3.4           1.4          0.3     Setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  Virginica
146           6.3          2.5           5.0          1.9  Virginica
147           6.5          3.0           5.2          2.0  Virginica
148           6.2          3.4           5.4          2.3  Virginica
149           5.9          3.0           5.1          1.8  Virginica

[148 rows x 5 columns]
     sepal.length  sepal.width  petal.length  petal.width    variety
0             4.9          3.0           1.4          0.2     Setosa
1  

In [75]:
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,4.9,3.0,1.4,0.2,Setosa
1,4.7,3.2,1.3,0.2,Setosa
2,5.4,3.9,1.7,0.4,Setosa
3,4.6,3.4,1.4,0.3,Setosa
4,5.0,3.4,1.5,0.2,Setosa
...,...,...,...,...,...
142,6.7,3.0,5.2,2.3,Virginica
143,6.3,2.5,5.0,1.9,Virginica
144,6.5,3.0,5.2,2.0,Virginica
145,6.2,3.4,5.4,2.3,Virginica


In [86]:
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,4.9,3.0,1.4,0.2,Setosa
1,4.7,3.2,1.3,0.2,Setosa
2,5.4,3.9,1.7,0.4,Setosa
3,4.6,3.4,1.4,0.3,Setosa
4,5.0,3.4,1.5,0.2,Setosa
...,...,...,...,...,...
142,6.7,3.0,5.2,2.3,Virginica
143,6.3,2.5,5.0,1.9,Virginica
144,6.5,3.0,5.2,2.0,Virginica
145,6.2,3.4,5.4,2.3,Virginica


In [87]:
df['petal.length'][0]=100
df['petal.width'][0]=100
df['sepal.length'][0]=100


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['petal.length'][0]=100
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['petal.length'][0]=100
You are set

In [88]:
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,100.0,3.0,100.0,100.0,Setosa
1,4.7,3.2,1.3,0.2,Setosa
2,5.4,3.9,1.7,0.4,Setosa
3,4.6,3.4,1.4,0.3,Setosa
4,5.0,3.4,1.5,0.2,Setosa
...,...,...,...,...,...
142,6.7,3.0,5.2,2.3,Virginica
143,6.3,2.5,5.0,1.9,Virginica
144,6.5,3.0,5.2,2.0,Virginica
145,6.2,3.4,5.4,2.3,Virginica


In [89]:
rows = df.shape[0]
for x in range(rows):
    values = []
    count = 0
    total = len(numerical_col) #4
    for y in numerical_col:
        if df[y][x] not in values:
            values.append(df[y][x])
        else:
            count +=1
    if count >=round(total/2):
        df.drop(x,inplace=True)
# after writing this shit code i remembered we have dropduplicates dunction LOL guys.. :)

In [90]:
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
1,4.7,3.2,1.3,0.2,Setosa
2,5.4,3.9,1.7,0.4,Setosa
3,4.6,3.4,1.4,0.3,Setosa
4,5.0,3.4,1.5,0.2,Setosa
5,4.4,2.9,1.4,0.2,Setosa
...,...,...,...,...,...
142,6.7,3.0,5.2,2.3,Virginica
143,6.3,2.5,5.0,1.9,Virginica
144,6.5,3.0,5.2,2.0,Virginica
145,6.2,3.4,5.4,2.3,Virginica


In [93]:
input_df = df.drop("variety",axis=1)
input_df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
1,4.7,3.2,1.3,0.2
2,5.4,3.9,1.7,0.4
3,4.6,3.4,1.4,0.3
4,5.0,3.4,1.5,0.2
5,4.4,2.9,1.4,0.2
...,...,...,...,...
142,6.7,3.0,5.2,2.3
143,6.3,2.5,5.0,1.9
144,6.5,3.0,5.2,2.0
145,6.2,3.4,5.4,2.3


In [95]:
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(input_df)
df = pd.DataFrame(df_scaled,columns=numerical_col)

In [96]:
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,0.111111,0.500000,0.050847,0.041667
1,0.305556,0.791667,0.118644,0.125000
2,0.083333,0.583333,0.067797,0.083333
3,0.194444,0.583333,0.084746,0.041667
4,0.027778,0.375000,0.067797,0.041667
...,...,...,...,...
141,0.666667,0.416667,0.711864,0.916667
142,0.555556,0.208333,0.677966,0.750000
143,0.611111,0.416667,0.711864,0.791667
144,0.527778,0.583333,0.745763,0.916667


In [118]:
df = pd.read_csv("iris.csv") # reseting all df changes
df
df = df.drop("variety",axis=1)

binarizer = Binarizer(threshold=0)
binary_data = binarizer.fit_transform(df)
binary_data
df = pd.DataFrame(binary_data,columns=numerical_col)
print(df)
# To people who might think WTF is use of this its just printing 1
# u use Binarizer to conver the df in such a way that if > 0  then 1, and if <=0 then 0
# now u know why all the values are one below :)..


     sepal.length  sepal.width  petal.length  petal.width
0             1.0          1.0           1.0          1.0
1             1.0          1.0           1.0          1.0
2             1.0          1.0           1.0          1.0
3             1.0          1.0           1.0          1.0
4             1.0          1.0           1.0          1.0
..            ...          ...           ...          ...
145           1.0          1.0           1.0          1.0
146           1.0          1.0           1.0          1.0
147           1.0          1.0           1.0          1.0
148           1.0          1.0           1.0          1.0
149           1.0          1.0           1.0          1.0

[150 rows x 4 columns]


In [147]:
df = pd.read_csv("fun.csv")
df

Unnamed: 0,say,do,name
0,what,nothing,remo
1,hi,bye,remo
2,what,bye,reuben
3,hi,nothing,reuben


In [150]:
encoder = OneHotEncoder()

df_encoded = encoder.fit_transform(df)

# df = pd.DataFrame(df_encoded)
df_encoded = pd.DataFrame(df_encoded.toarray(), columns=encoder.get_feature_names_out())

df_encoded
# If the ending is not much under standable its because i just wanted the data to look good you can skip some of these attributes

Unnamed: 0,say_hi,say_what,do_bye,do_nothing,name_remo,name_reuben
0,0.0,1.0,0.0,1.0,1.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,1.0
