In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data = pd.read_csv("reviews.csv")
df = pd.DataFrame(data)
df

Unnamed: 0,ReviewID,ProductID,CustomerID,Rating
0,301,201.0,1,5.0
1,302,202.0,3,4.0
2,303,203.0,2,3.0
3,304,204.0,5,
4,305,,4,5.0


In [3]:
df.head()

Unnamed: 0,ReviewID,ProductID,CustomerID,Rating
0,301,201.0,1,5.0
1,302,202.0,3,4.0
2,303,203.0,2,3.0
3,304,204.0,5,
4,305,,4,5.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ReviewID    5 non-null      int64  
 1   ProductID   4 non-null      float64
 2   CustomerID  5 non-null      int64  
 3   Rating      4 non-null      float64
dtypes: float64(2), int64(2)
memory usage: 232.0 bytes


In [5]:
df.describe()

Unnamed: 0,ReviewID,ProductID,CustomerID,Rating
count,5.0,4.0,5.0,4.0
mean,303.0,202.5,3.0,4.25
std,1.581139,1.290994,1.581139,0.957427
min,301.0,201.0,1.0,3.0
25%,302.0,201.75,2.0,3.75
50%,303.0,202.5,3.0,4.5
75%,304.0,203.25,4.0,5.0
max,305.0,204.0,5.0,5.0


In [6]:
np.shape(df)

(5, 4)

In [7]:
df['Rating'].fillna(df['Rating'].mean(), inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Rating'].fillna(df['Rating'].mean(), inplace=True)


Unnamed: 0,ReviewID,ProductID,CustomerID,Rating
0,301,201.0,1,5.0
1,302,202.0,3,4.0
2,303,203.0,2,3.0
3,304,204.0,5,4.25
4,305,,4,5.0


In [8]:
df["ProductID"] = df["ProductID"].interpolate()

In [9]:
np.median(df['Rating'])


4.25

In [10]:
np.mean(df['Rating'])

4.25

In [11]:
df['Rating'].min(), df['Rating'].max()


(3.0, 5.0)

In [12]:
df['Rating'].std()


0.82915619758885

In [13]:
df['Rating'].mean()


4.25

In [14]:
# Ratings greater than 4
df[df['Rating'] > 4]


Unnamed: 0,ReviewID,ProductID,CustomerID,Rating
0,301,201.0,1,5.0
3,304,204.0,5,4.25
4,305,204.0,4,5.0


In [15]:
np.where(df['Rating'] > 4)


(array([0, 3, 4], dtype=int32),)

In [16]:
df.sort_values('Rating', ascending=False)


Unnamed: 0,ReviewID,ProductID,CustomerID,Rating
0,301,201.0,1,5.0
4,305,204.0,4,5.0
3,304,204.0,5,4.25
1,302,202.0,3,4.0
2,303,203.0,2,3.0


In [17]:
np.argsort(df['Rating'])[::-1]


4    4
3    0
2    3
1    1
0    2
Name: Rating, dtype: int32

In [18]:
df['Rating'].agg(['sum', 'mean', 'min', 'max'])


sum     21.25
mean     4.25
min      3.00
max      5.00
Name: Rating, dtype: float64

In [19]:
df.pivot_table(values='Rating', index='ProductID', aggfunc='mean')


Unnamed: 0_level_0,Rating
ProductID,Unnamed: 1_level_1
201.0,5.0
202.0,4.0
203.0,3.0
204.0,4.625


In [20]:
df2 = pd.DataFrame({'ProductID': [201, 202, 203], 'ProductName': ['Laptop', 'Smartphone', 'Tablet']})
merged_df = pd.merge(df, df2, on='ProductID')
merged_df

Unnamed: 0,ReviewID,ProductID,CustomerID,Rating,ProductName
0,301,201.0,1,5.0,Laptop
1,302,202.0,3,4.0,Smartphone
2,303,203.0,2,3.0,Tablet


In [21]:
np.concatenate([df['Rating'].values, df['CustomerID'].values])


array([5.  , 4.  , 3.  , 4.25, 5.  , 1.  , 3.  , 2.  , 5.  , 4.  ])

In [22]:
df['Adjusted_Rating'] = df['Rating'] + 1  # For example, adding 1 to each rating
df

Unnamed: 0,ReviewID,ProductID,CustomerID,Rating,Adjusted_Rating
0,301,201.0,1,5.0,6.0
1,302,202.0,3,4.0,5.0
2,303,203.0,2,3.0,4.0
3,304,204.0,5,4.25,5.25
4,305,204.0,4,5.0,6.0


In [23]:
df['Double_Rating'] = df['Rating'].apply(lambda x: x * 2)
df

Unnamed: 0,ReviewID,ProductID,CustomerID,Rating,Adjusted_Rating,Double_Rating
0,301,201.0,1,5.0,6.0,10.0
1,302,202.0,3,4.0,5.0,8.0
2,303,203.0,2,3.0,4.0,6.0
3,304,204.0,5,4.25,5.25,8.5
4,305,204.0,4,5.0,6.0,10.0


In [24]:
df['Rating'].value_counts()


Rating
5.00    2
4.00    1
3.00    1
4.25    1
Name: count, dtype: int64

In [25]:
np.histogram(df['Rating'], bins=5)


(array([1, 0, 1, 1, 2], dtype=int32), array([3. , 3.4, 3.8, 4.2, 4.6, 5. ]))