In [1]:
import pandas as pd
import numpy as np

#### Reading data

In [2]:
df_sales = pd.read_csv('sales.csv')
df_sales.head()

Unnamed: 0,date,sales_amount,product_id,units_sold,store_location
0,"April 06, 2023",high,P003,,
1,2023-01-04,6727.63,P003,28.0,
2,2023-02-11,8208.54,P002,36.0,New York
3,2023-02-03,7094.52,,26.0,Houston
4,2023-04-21,6160.3,p001,8.0,Houston


In [3]:
df_weather = pd.read_csv('weather.csv')
df_weather.head()

Unnamed: 0,date,Temp,humidity(%),wind_speed,city
0,06/04/2023,hot,,fast,Chicago
1,2023-01-04,24.4,56.0,15.37,
2,2023-02-11,38.8,70.0,0.87,
3,2023-02-03,33.3,63.0,19.89,
4,2023-04-21,30.0,43.0,9.4,New York


#### Cleaning and Transforming

In [4]:
df_sales.shape

(152, 5)

#### replacing str values like 'High' from 'sales_amount' column with median and changing column's name to 'sales'

In [5]:
df_sales['sales_amount'].dtype

dtype('O')

In [6]:
df_sales['sales_amount'].unique()

array(['high', '6727.63', '8208.54', '7094.52', '6160.3', '2156.5',
       '8300.84', '8385.76', '6633.46', '8383.84', '6863.36', '2860.16',
       '3465.65', '2931.3', '4395.33', '1350.67', '6564.28', '4028.99',
       '6901.5', '4468.57', '7134.52', '4065.62', '3346.25', '5464.34',
       '7236.01', '4135.03', '9429.83', '1352.68', '4761.51', '9708.22',
       '5931.75', '4811.24', '6116.68', '6183.32', '7584.83', '2149.21',
       '3250.15', '6224.9', '8804.05', '6056.8', '3147.37', '7118.6',
       '7659.18', '3144.13', '4399.56', '5808.95', '5469.05', '4506.56',
       '3678.72', '1899.86', '1481.37', '9626.87', '8624.29', '4194.15',
       '9611.21', '7090.93', '5342.69', '5437.23', '1749.56', '1825.34',
       '6421.97', '5983.33', '2914.55', '9515.75', '8031.66', '2021.18',
       '9378.36', '9768.23', '9963.38', '1502.84', '7633.32', '5913.24',
       '7352.48', '9717.87', '7192.27', '8532.67', '8801.82', '8546.33',
       '4834.82', '3003.19', '4569.86', '9027.07', '2319.44',

In [7]:
df_sales['sales_amount'].replace('high', 0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_sales['sales_amount'].replace('high', 0, inplace=True)


In [8]:
df_sales['sales_amount'].replace(0, df_sales['sales_amount'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_sales['sales_amount'].replace(0, df_sales['sales_amount'].median(), inplace=True)


In [9]:
df_sales['sales'] = pd.to_numeric(df_sales['sales_amount'])

In [10]:
df_sales = df_sales.drop('sales_amount', axis=1)

#### renaming 'units_sold' to 'quantity' and 'store_location' to 'city'

In [11]:
df_sales['units_sold'].dtype

dtype('float64')

In [24]:
df_sales.rename(columns={'units_sold' : 'quantity', 'store_location' : 'city'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sales.rename(columns={'units_sold' : 'quantity', 'store_location' : 'city'}, inplace=True)


In [25]:
print(df_sales)

          date product_id  quantity      city    sales     month
0          NaT       P003       NaN       NaN  5389.96       NaN
1   2023-01-04       P003      28.0       NaN  6727.63   January
2   2023-02-11       P002      36.0  New York  8208.54  February
3   2023-02-03        NaN      26.0   Houston  7094.52  February
4   2023-04-21       p001       8.0   Houston  6160.30     April
..         ...        ...       ...       ...      ...       ...
145 2023-02-01        NaN      24.0   Chicago  9628.32  February
146 2023-03-22       P002       5.0       NaN  1520.78     March
147 2023-04-20       p001      34.0   Houston  4550.70     April
148 2023-01-31       p001       6.0   Houston  1960.80   January
149 2023-02-16       P003       2.0       NaN  4020.94  February

[150 rows x 6 columns]


#### cleaning 'date' column and creating a new 'month' column from 'date'

In [26]:
df_sales['date'] = pd.to_datetime(df_sales['date'], format='%Y-%m-%d', errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sales['date'] = pd.to_datetime(df_sales['date'], format='%Y-%m-%d', errors='coerce')


In [27]:
# df_sales['date'] = df_sales['date'].dt.strftime('%d/%m/%Y')

In [28]:
df_sales['date'].isna().sum() 

np.int64(25)

In [29]:
df_sales['month'] = df_sales['date'].dt.month_name()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sales['month'] = df_sales['date'].dt.month_name()


In [30]:
df_sales

Unnamed: 0,date,product_id,quantity,city,sales,month
0,NaT,P003,,,5389.96,
1,2023-01-04,P003,28.0,,6727.63,January
2,2023-02-11,P002,36.0,New York,8208.54,February
3,2023-02-03,,26.0,Houston,7094.52,February
4,2023-04-21,p001,8.0,Houston,6160.30,April
...,...,...,...,...,...,...
145,2023-02-01,,24.0,Chicago,9628.32,February
146,2023-03-22,P002,5.0,,1520.78,March
147,2023-04-20,p001,34.0,Houston,4550.70,April
148,2023-01-31,p001,6.0,Houston,1960.80,January


#### finding and dropping duplicates

In [31]:
df_sales[df_sales.duplicated()]

Unnamed: 0,date,product_id,quantity,city,sales,month


In [32]:
df_sales = df_sales.drop_duplicates()

In [33]:
df_sales.isnull().sum()

date          25
product_id    23
quantity      17
city          74
sales          0
month         25
dtype: int64