In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

In [19]:
# Example: Creating a sample DataFrame for illustration
data = {
    'Name': ['Alice', 'Bob', 'Charlie', np.nan, 'Eva'],
    'Age': [25, 30, 35, None, 105],
    'Email': ['alice@example.com', None, 'charlie@somemail', 'na', 'eva@domain.com'],
    'JoinDate': ['2021-01-01', '2020-06-15', None, '2019-07-20', 'NA'],
    'Salary': [70000, 50000, None, 0, 150000]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

Original DataFrame:
      Name    Age              Email    JoinDate    Salary
0    Alice   25.0  alice@example.com  2021-01-01   70000.0
1      Bob   30.0               None  2020-06-15   50000.0
2  Charlie   35.0   charlie@somemail        None       NaN
3      NaN    NaN                 na  2019-07-20       0.0
4      Eva  105.0     eva@domain.com          NA  150000.0


Coloanele Name si Age care au NaN nu sunt relevante, pentru ca nu le vom putea nici cum recupera

In [20]:
df_cleaned = df.dropna(subset=['Name', 'Age'])
df_cleaned

Unnamed: 0,Name,Age,Email,JoinDate,Salary
0,Alice,25.0,alice@example.com,2021-01-01,70000.0
1,Bob,30.0,,2020-06-15,50000.0
2,Charlie,35.0,charlie@somemail,,
4,Eva,105.0,eva@domain.com,,150000.0


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      4 non-null      object 
 1   Age       4 non-null      float64
 2   Email     4 non-null      object 
 3   JoinDate  4 non-null      object 
 4   Salary    4 non-null      float64
dtypes: float64(2), object(3)
memory usage: 328.0+ bytes


In [22]:
df['JoinDate'].value_counts()

JoinDate
2021-01-01    1
2020-06-15    1
2019-07-20    1
NA            1
Name: count, dtype: int64

In [23]:
df_cleaned['JoinDate'].replace('NA', np.nan, inplace=True)
df_cleaned

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['JoinDate'].replace('NA', np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['JoinDate'].replace('NA', np.nan, inplace=True)


Unnamed: 0,Name,Age,Email,JoinDate,Salary
0,Alice,25.0,alice@example.com,2021-01-01,70000.0
1,Bob,30.0,,2020-06-15,50000.0
2,Charlie,35.0,charlie@somemail,,
4,Eva,105.0,eva@domain.com,,150000.0


In [24]:
df.isnull().sum()

Name        1
Age         1
Email       1
JoinDate    1
Salary      1
dtype: int64

In [25]:
df_cleaned['Joined'] = df_cleaned['JoinDate'].notnull()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Joined'] = df_cleaned['JoinDate'].notnull()


In [27]:
df_cleaned = df_cleaned[df_cleaned['Age']<60]
df_cleaned

Unnamed: 0,Name,Age,Email,JoinDate,Salary,Joined
0,Alice,25.0,alice@example.com,2021-01-01,70000.0,True
1,Bob,30.0,,2020-06-15,50000.0,True
2,Charlie,35.0,charlie@somemail,,,False


In [28]:
df_cleaned.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.drop_duplicates(inplace=True)


In [29]:
df_cleaned

Unnamed: 0,Name,Age,Email,JoinDate,Salary,Joined
0,Alice,25.0,alice@example.com,2021-01-01,70000.0,True
1,Bob,30.0,,2020-06-15,50000.0,True
2,Charlie,35.0,charlie@somemail,,,False


In [30]:
df_cleaned['JoinDate'] = pd.to_datetime(df_cleaned['JoinDate'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['JoinDate'] = pd.to_datetime(df_cleaned['JoinDate'], errors='coerce')


In [31]:
df_cleaned

Unnamed: 0,Name,Age,Email,JoinDate,Salary,Joined
0,Alice,25.0,alice@example.com,2021-01-01,70000.0,True
1,Bob,30.0,,2020-06-15,50000.0,True
2,Charlie,35.0,charlie@somemail,NaT,,False


In [32]:
df_cleaned['Email'].fillna('unknow@unknow.com', inplace=True)
df_cleaned

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Email'].fillna('unknow@unknow.com', inplace=True)


Unnamed: 0,Name,Age,Email,JoinDate,Salary,Joined
0,Alice,25.0,alice@example.com,2021-01-01,70000.0,True
1,Bob,30.0,unknow@unknow.com,2020-06-15,50000.0,True
2,Charlie,35.0,charlie@somemail,NaT,,False


In [40]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 0 to 2
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Name      3 non-null      object        
 1   Age       3 non-null      float64       
 2   Email     3 non-null      object        
 3   JoinDate  2 non-null      datetime64[ns]
 4   Salary    2 non-null      float64       
 5   Joined    3 non-null      bool          
dtypes: bool(1), datetime64[ns](1), float64(2), object(2)
memory usage: 255.0+ bytes
