In [None]:
import pandas as pd
import numpy as np
import re
import math
import matplotlib.pyplot as plt
import seaborn as sns

## load movie data

In [None]:
# Location of the raw movie dataset (Kaggle input directory).
MOVIES_CSV = '/kaggle/input/movies-dataset-from-piracy-website/movies_dataset.csv'
org_data = pd.read_csv(MOVIES_CSV)

## 数据摘要
### 标称数据包括 appropriate_for director industry language storyline title id 以及 writer
### 其中有意义的包括 appropriate_for director industry language 以及 writer
### storyline title id 三者几乎是unique 但数据存在问题 存在部分重复样本
### 
### 数值数据包括 imdb_rating downloads posted_date release_date run_time views
### 都具有一定意义

In [None]:
# Attribute groups used by the rest of the analysis.
nominals = ['appropriate_for', 'director',  'industry', 'language', 'writer']
numerics = ['imdb_rating', 'downloads', 'posted_date', 'release_date', 'run_time', 'views']

# Drop the first column ('Unnamed: 0') and normalise the remaining column
# names to snake_case.
data = org_data.drop(columns='Unnamed: 0').rename(
    columns=lambda name: name.replace('-', '_').lower()
)

# Views and downloads contain ',' separators; strip them and make the
# columns numeric.
for count_col in ('downloads', 'views'):
    data[count_col] = data[count_col].str.replace(',', '').astype('float')

# Ids are stored as strings.
data['id'] = data['id'].astype('str')

# Display floats with two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)
def run_time_process(e):
    """Convert a run-time value to a number of minutes.

    Accepted forms (spaces are ignored): a bare integer such as ``'102'``,
    or an hour/minute string such as ``'1h 47min'``, ``'2h'`` or ``'45min'``.

    Parameters
    ----------
    e : object
        Raw cell value; it is converted with ``str()`` before parsing.

    Returns
    -------
    int or float
        Minutes as an ``int``, or ``np.nan`` when the value is missing or
        cannot be parsed.
    """
    e = str(e).replace(' ', '')
    if e == 'nan':
        return np.nan
    try:
        if 'h' not in e and 'min' not in e:
            # Bare number: already expressed in minutes.
            return int(e)
        # Hours are the digits before 'h'; minutes are the digits between
        # 'h' (if any) and 'min'.
        hour = int(e.split('h')[0]) if 'h' in e else 0
        minute = int(e.split('min')[0].split('h')[-1]) if 'min' in e else 0
    except ValueError:
        # A single malformed value must not abort a whole-column .apply();
        # report it as missing instead.
        return np.nan
    return int(hour * 60 + minute)
# Run time: parse strings such as '1h20min' or '102' into minutes, then coerce
# anything that is still non-numeric to NaN.
data['run_time'] = pd.to_numeric(data['run_time'].apply(run_time_process), errors='coerce')

# Dates: keep the original values in 'old_*' columns next to the parsed
# datetime columns.
data['old_posted_date'] = data['posted_date']
data['posted_date'] = pd.to_datetime(data['posted_date'])

data['old_release_date'] = data['release_date']
data['release_date'] = pd.to_datetime(data['release_date'])
data

## 标称属性
### 标称属性的缺失值的个数

In [None]:
# Number of missing values per nominal attribute, shown as a bar chart.
nan_counts = data[nominals].isna().sum()
plt.bar(nominals, nan_counts.values)
plt.title('nan value counts')

### 标称属性的每个可能取值的频数
#### 由 .value_counts() 取得，这里仅展示频数最高的前十个取值

In [None]:
# One bar chart per nominal attribute showing its most frequent values,
# laid out on a two-column grid.
top_n = 10
n_cols = 2
n_rows = math.ceil(len(nominals) / n_cols)  # just enough rows, no empty extra row

# Keep an explicit handle on the figure instead of re-selecting it by number.
fig = plt.figure(figsize=(10, 10), dpi=80)
fig.subplots_adjust(hspace=1)
for position, attr in enumerate(nominals, start=1):
    axis = fig.add_subplot(n_rows, n_cols, position)
    data[attr].value_counts().head(top_n).plot.bar(ax=axis)
    axis.set_title(f'{attr}: frequency of top {top_n}')

## 数值属性
### 数值属性的缺失值个数

In [None]:
# Number of missing values per numeric attribute, shown as a bar chart with
# rotated attribute names on the x axis.
nan_counts = data[numerics].isna().sum()
positions = range(len(numerics))
plt.bar(positions, nan_counts.values)
plt.xticks(positions, numerics, rotation=45)
plt.title('nan value counts')

## 数值属性的五数、盒图
### 
### release date
####     处理时 仅关注年份信息

In [None]:
# Release dates are summarised by year only.
attr = 'release_date'
release_years = data[attr].dt.year
print(release_years.describe())
release_years.to_frame().plot.box()
plt.title(f'{attr}: box')
plt.show()

### posted date
####     处理时 仅关注年份信息

In [None]:
# Posted dates are summarised by year only.
attr = 'posted_date'
posted_years = data[attr].dt.year
print(posted_years.describe())
posted_years.to_frame().plot.box()
plt.title(f'{attr}: box')
plt.show()

### imdb rating

In [None]:
# Summary statistics and box plot for the IMDb rating.
attr = 'imdb_rating'
ratings = data[attr]
print(ratings.describe())
ratings.to_frame().plot.box()
plt.title(f'{attr}: box')
plt.show()

### downloads

In [None]:
# Summary statistics and box plot for the download counts.
attr = 'downloads'
download_counts = data[attr]
print(download_counts.describe())
download_counts.to_frame().plot.box()
plt.title(f'{attr}: box')
plt.show()

### run_time
#### 单位为分钟

In [None]:
# Summary statistics and box plot for the run time (in minutes).
attr = 'run_time'
minutes = data[attr]
print(minutes.describe())
minutes.to_frame().plot.box()
plt.title(f'{attr}: box')
plt.show()

### views

In [None]:
# Summary statistics and box plot for the view counts.
attr = 'views'
view_counts = data[attr]
print(view_counts.describe())
view_counts.to_frame().plot.box()
plt.title(f'{attr}: box')
plt.show()

## 缺失值处理
### 
### 剔除
#### 剔除后仅剩9902条数据 远少于原数据量20548

In [None]:
# Strategy 1: discard every row that has a missing value in any column.
new_data = data.dropna(axis=0, how='any')
new_data

#### 标称属性变化

In [None]:
# Top value frequencies of each nominal attribute after dropping incomplete
# rows, one bar chart per attribute on a two-column grid.
top_n = 10
n_cols = 2
n_rows = math.ceil(len(nominals) / n_cols)  # just enough rows, no empty extra row

# Keep an explicit handle on the figure instead of re-selecting it by number.
fig = plt.figure(figsize=(10, 10), dpi=80)
fig.subplots_adjust(hspace=1)
for position, attr in enumerate(nominals, start=1):
    axis = fig.add_subplot(n_rows, n_cols, position)
    new_data[attr].value_counts().head(top_n).plot.bar(ax=axis)
    axis.set_title(f'{attr}: frequency of top {top_n}')

#### 数值属性变化

In [None]:
# Summary statistics and box plot of every numeric attribute after dropping
# incomplete rows. Both the statistics and the plot are taken from new_data so
# that the figure reflects the treated data rather than the original frame.
date_attrs = ('release_date', 'posted_date')  # dates are summarised by year only
for attr in ('release_date', 'posted_date', 'imdb_rating', 'downloads', 'run_time', 'views'):
    values = new_data[attr].dt.year if attr in date_attrs else new_data[attr]
    print(attr)
    print(values.describe())
    values.to_frame().plot.box()
    plt.title(f'{attr}: box')
    plt.show()

### 最高频率值填补

In [None]:
# Strategy 2: replace missing values with the most frequent observed value of
# the same attribute.
attrs = nominals + numerics
new_data = data.copy(deep=True)
for name in attrs:
    column = data[name]
    # value_counts() is sorted by descending frequency, so the first label is
    # the most common value.
    top_value = column.value_counts().index[0]
    new_data[name] = column.fillna(top_value)
new_data

#### 标称属性变化

In [None]:
# Top value frequencies of each nominal attribute after most-frequent-value
# filling, one bar chart per attribute on a two-column grid.
top_n = 10
n_cols = 2
n_rows = math.ceil(len(nominals) / n_cols)  # just enough rows, no empty extra row

# Keep an explicit handle on the figure instead of re-selecting it by number.
fig = plt.figure(figsize=(10, 10), dpi=80)
fig.subplots_adjust(hspace=1)
for position, attr in enumerate(nominals, start=1):
    axis = fig.add_subplot(n_rows, n_cols, position)
    new_data[attr].value_counts().head(top_n).plot.bar(ax=axis)
    axis.set_title(f'{attr}: frequency of top {top_n}')

#### 数值属性变化

In [None]:
# Summary statistics and box plot of every numeric attribute after
# most-frequent-value filling. Both the statistics and the plot are taken from
# new_data so that the figure reflects the filled data rather than the
# original frame.
date_attrs = ('release_date', 'posted_date')  # dates are summarised by year only
for attr in ('release_date', 'posted_date', 'imdb_rating', 'downloads', 'run_time', 'views'):
    values = new_data[attr].dt.year if attr in date_attrs else new_data[attr]
    print(attr)
    print(values.describe())
    values.to_frame().plot.box()
    plt.title(f'{attr}: box')
    plt.show()

### 相关关系填补

In [None]:
# Strategy 3: look for correlated attributes that could predict each other.
new_data = data.copy(deep=True)
# Restrict to numeric columns explicitly: recent pandas versions no longer
# drop non-numeric columns silently in DataFrame.corr() and raise instead.
corr_matrix = new_data.select_dtypes(include='number').corr()
corr_matrix

#### 虽然downloads与views呈现高相关性 但数据集中缺失downloads的数据也同时缺失views(仅一条 index 149)
#### 此外imdb_rating的缺失都伴随着run_time downloads的缺失 同时views数据与其不存在明显的关系 因此无法利用此方法填补缺失值
#### 对于run_time数据缺失使用downloads预测
#### 使用随机森林算法

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit on rows where both columns are present. downloads is the single input
# feature and run_time is the target, matching the way the model is used
# below (predict run_time from downloads).
train = new_data[['downloads', 'run_time']].dropna()
rfr_1 = RandomForestRegressor(random_state=0, n_estimators=200, n_jobs=-1)
rfr_1.fit(train[['downloads']].values, train['run_time'].values)

# Rows that can be imputed: run_time is missing but downloads is known.
to_fill = new_data['run_time'].isna() & new_data['downloads'].notna()
if to_fill.any():  # predict() cannot be called on an empty feature matrix
    X = new_data.loc[to_fill, ['downloads']].values
    new_data.loc[to_fill, 'run_time'] = rfr_1.predict(X)

new_data

#### 标称属性变化

In [None]:
# Top value frequencies of each nominal attribute after the correlation-based
# filling, one bar chart per attribute on a two-column grid.
top_n = 10
n_cols = 2
n_rows = math.ceil(len(nominals) / n_cols)  # just enough rows, no empty extra row

# Keep an explicit handle on the figure instead of re-selecting it by number.
fig = plt.figure(figsize=(10, 10), dpi=80)
fig.subplots_adjust(hspace=1)
for position, attr in enumerate(nominals, start=1):
    axis = fig.add_subplot(n_rows, n_cols, position)
    new_data[attr].value_counts().head(top_n).plot.bar(ax=axis)
    axis.set_title(f'{attr}: frequency of top {top_n}')

#### 数值属性变化

In [None]:
# Summary statistics and box plot of every numeric attribute after the
# correlation-based filling. Both the statistics and the plot are taken from
# new_data so that the figure reflects the filled data rather than the
# original frame.
date_attrs = ('release_date', 'posted_date')  # dates are summarised by year only
for attr in ('release_date', 'posted_date', 'imdb_rating', 'downloads', 'run_time', 'views'):
    values = new_data[attr].dt.year if attr in date_attrs else new_data[attr]
    print(attr)
    print(values.describe())
    values.to_frame().plot.box()
    plt.title(f'{attr}: box')
    plt.show()

### 基于相似性 
#### 利用impyute工具 对几个数值属性进行填补

In [None]:
pip install impyute

In [None]:
from impyute import fast_knn

# Strategy 4: fill the numeric features with impyute's fast_knn (k=2).
features = ['imdb_rating', 'views', 'downloads', 'run_time']
new_data = data.copy(deep=True)
imputed = fast_knn(new_data[features].to_numpy(), k=2)
# Column assignment aligns on the index, so give the imputed frame the same
# index as new_data instead of relying on both being a default RangeIndex.
new_data[features] = pd.DataFrame(imputed, columns=features, index=new_data.index)
new_data.isnull().any()

#### 标称属性变化

In [None]:
# Top value frequencies of each nominal attribute after the similarity-based
# filling, one bar chart per attribute on a two-column grid.
top_n = 10
n_cols = 2
n_rows = math.ceil(len(nominals) / n_cols)  # just enough rows, no empty extra row

# Keep an explicit handle on the figure instead of re-selecting it by number.
fig = plt.figure(figsize=(10, 10), dpi=80)
fig.subplots_adjust(hspace=1)
for position, attr in enumerate(nominals, start=1):
    axis = fig.add_subplot(n_rows, n_cols, position)
    new_data[attr].value_counts().head(top_n).plot.bar(ax=axis)
    axis.set_title(f'{attr}: frequency of top {top_n}')

#### 数值属性变化

In [None]:
# Summary statistics and box plot of every numeric attribute after the
# similarity-based filling. Both the statistics and the plot are taken from
# new_data so that the figure reflects the filled data rather than the
# original frame.
date_attrs = ('release_date', 'posted_date')  # dates are summarised by year only
for attr in ('release_date', 'posted_date', 'imdb_rating', 'downloads', 'run_time', 'views'):
    values = new_data[attr].dt.year if attr in date_attrs else new_data[attr]
    print(attr)
    print(values.describe())
    values.to_frame().plot.box()
    plt.title(f'{attr}: box')
    plt.show()