In [1]:
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import matplotlib as mpl
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from PIL import Image
import nltk
%matplotlib inline

In [6]:
def get_outliers(df):
    """Draw box plots of three columns on a 2x2 grid of axes.

    For every panel ``delete_missing_lines`` is called on ``df`` with a
    single-column list, and one column of the returned object is passed to
    ``sns.boxplot``. The bottom-right cell of the grid is never drawn on.
    Finally the figure is resized to 18.5 x 10.5 inches at 105 dpi.

    Args:
        df: Data set exposing ``running_time``, ``mpaa_cat`` and
            ``num_of_countries`` (presumably a pandas DataFrame -- confirm
            against the caller).
    """
    figure_, grid = plt.subplots(2, 2)

    # Each entry: (column handed to delete_missing_lines, column plotted,
    #              grid cell, box colour).
    # NOTE(review): the second panel filters on 'mpaa' but plots 'mpaa_cat';
    # presumably mpaa_cat is derived from mpaa -- confirm.
    panels = (
        ('running_time', 'running_time', (0, 0), 'yellow'),
        ('mpaa', 'mpaa_cat', (0, 1), 'green'),
        ('num_of_countries', 'num_of_countries', (1, 0), 'blue'),
    )
    for filter_col, plotted_col, cell, colour in panels:
        cleaned = delete_missing_lines(df, [filter_col])
        sns.boxplot(x=getattr(cleaned, plotted_col), ax=grid[cell], color=colour)

    figure_.set_size_inches(18.5, 10.5, forward=True)
    figure_.set_dpi(105)


In [17]:
def get_vis_division_by_year(df):
    """Show how the rows of ``df`` are distributed over ``df.year``.

    One 20 x 6 inch figure (105 dpi) is created with two panels: a pie chart
    with one wedge per distinct year on the left, and a 50-bin histogram of
    the raw year values on the right.

    Args:
        df: Object whose ``year`` attribute supports ``value_counts()``
            (presumably a pandas DataFrame -- confirm against the caller).
    """
    canvas = plt.figure(figsize=(20, 6), dpi=105)
    pie_ax = canvas.add_subplot(121)
    hist_ax = canvas.add_subplot(122)

    # Right panel: histogram of the raw year values.
    hist_ax.hist(df.year, bins=50)
    hist_ax.set_title("Divided by histogram")
    hist_ax.set_xlabel('Year')
    hist_ax.set_ylabel('Frequency')

    # Left panel: share of each distinct year, labelled by the year itself.
    per_year = df.year.value_counts()
    pie_ax.pie(per_year, labels=per_year.index, autopct='%1.2f%%')
    pie_ax.set_title("Divided by percentages")

In [20]:
def get_vis_division_by_genres(df):
    """Draw a pie chart of how often each genre occurs in ``df['genres']``.

    Every ``genres`` value is split on ``'|'``, the resulting lists are
    exploded to one genre per entry, and the occurrences of each genre are
    counted. The counts are drawn as a single pie on a 15 x 6 inch figure
    (100 dpi) titled "Genres".

    Args:
        df: Table with a ``genres`` column that supports the ``.str``
            accessor (presumably a pandas DataFrame of '|'-separated
            strings -- confirm against the caller).
    """
    genre_lists = df['genres'].str.split('|')
    genre_counts = genre_lists.explode().value_counts()

    canvas = plt.figure(figsize=(15, 6), dpi=100)
    pie_ax = canvas.add_subplot(111)
    pie_ax.pie(genre_counts, labels=genre_counts.index, autopct='%1.f%%')
    pie_ax.set_title("Genres")

In [32]:
def get_scatter_plot(df):
    """Scatter five columns of ``df`` against ``df.worldwide_income``.

    A 3x2 grid of axes is created; five cells receive one scatter plot each
    (x = the listed column, y = ``worldwide_income`` cast to float) and the
    bottom-right cell is left untouched. The figure is then resized to
    15 x 20 inches at 150 dpi.

    Args:
        df: Table exposing ``international_income``, ``domestic_income``,
            ``num_of_countries``, ``writer_sum``, ``running_time`` and
            ``worldwide_income`` as attributes (presumably a pandas
            DataFrame -- confirm against the caller).
    """
    fig, axs = plt.subplots(3, 2)

    # Each entry: (grid cell, x column, extra scatter kwargs, title,
    #              x-axis label, y-axis label).
    # The second panel deliberately carries no colour so matplotlib's
    # default is used, and it has a different y label from the others.
    panels = (
        ((0, 0), 'international_income', {'color': 'green'},
         'International income & Worldwide income',
         'International income', 'Income(Billion)'),
        ((0, 1), 'domestic_income', {},
         'Domestic income & Worldwide income',
         'Domestic income', 'Worldwide income(Billion)'),
        ((1, 0), 'num_of_countries', {'color': 'salmon'},
         'Countries & Worldwide income',
         'Number of countries', 'Income(Billion)'),
        ((1, 1), 'writer_sum', {'color': 'orange'},
         'Number of writers & Worldwide income',
         'Number of writers', 'Income(Billion)'),
        ((2, 0), 'running_time', {'color': 'purple'},
         'Running time & Worldwide income',
         'Running time(minutes)', 'Income(Billion)'),
    )
    for cell, x_column, style, title, x_label, y_label in panels:
        panel = axs[cell]
        panel.scatter(getattr(df, x_column), df.worldwide_income.astype(float), **style)
        panel.set_title(title)
        panel.set(xlabel=x_label, ylabel=y_label)

    fig.set_size_inches(15, 20, forward=True)
    fig.set_dpi(150)

In [40]:
def get_two_3D_scaater_plot(df,x1_name,y1_name,z1_name,x2_name="",y2_name="",z2_name=""):
    """Draw one or two 3D scatter plots of ``df`` columns side by side.

    The first plot always uses ``x1_name``/``y1_name``/``z1_name``. The
    second plot is drawn only when ``x2_name`` is not the empty string, in
    which case ``y2_name`` and ``z2_name`` are looked up in ``df`` as well.
    Points are coloured by their z value. The figure is resized to
    15 x 20 inches at 150 dpi and shown with ``plt.show()``.

    Args:
        df: Mapping-like table indexed by column name.
        x1_name, y1_name, z1_name: Column names for the left plot; also used
            as axis labels and in the title.
        x2_name, y2_name, z2_name: Column names for the optional right plot.
    """
    fig = plt.figure(figsize=plt.figaspect(0.5))

    def draw_panel(position, x_name, y_name, z_name):
        # One 3D subplot in a 1x2 layout at the given position (1 or 2).
        axes3d = fig.add_subplot(1, 2, position, projection='3d')
        xs = df[x_name]
        ys = df[y_name]
        zs = df[z_name]
        axes3d.set_xlabel(x_name)
        axes3d.set_ylabel(y_name)
        axes3d.set_zlabel(z_name)
        axes3d.set_title("Correlation -" + " & ".join((x_name, y_name, z_name)))
        # The z values double as the colour of each point.
        axes3d.scatter3D(xs, ys, zs, c=zs, depthshade=False)

    draw_panel(1, x1_name, y1_name, z1_name)

    # The second panel is optional: an empty x2_name means "left plot only".
    if x2_name != "":
        draw_panel(2, x2_name, y2_name, z2_name)

    fig.set_size_inches(15, 20, forward=True)
    fig.set_dpi(150)
    plt.show()

In [66]:
def get_bar_vis_text_data(df,coll_name):
    """Plot a bar chart of the first 20 entries of a column's word frequencies.

    The frequencies come from ``get_freqDist(df, coll_name)``; the first 20
    keys (in the mapping's own order) become the bars and the x tick labels,
    which are drawn vertically. The chart is titled with ``coll_name`` and
    shown with ``plt.show()`` on the current pyplot figure.

    Args:
        df: Data set forwarded unchanged to ``get_freqDist``.
        coll_name: Column name forwarded to ``get_freqDist`` and used as the
            chart title.
    """
    frequencies = get_freqDist(df, coll_name)
    top_k = 20

    # Slice once and reuse for both the bars and the tick labels.
    top_terms = list(frequencies)[:top_k]
    top_counts = [frequencies[term] for term in top_terms]

    plt.bar(top_terms, top_counts)
    plt.xticks(top_terms, rotation='vertical')
    plt.title(coll_name)
    plt.show()

In [62]:
def get_wordcloud_vis_text_data(df,coll_name):
    """Render a word cloud of a column's word frequencies.

    The frequencies come from ``get_freqDist(df, coll_name)`` and are fed to
    a ``WordCloud`` configured with at most 75 words, a maximum font size of
    50 and a white background. The result is displayed on a new 8 x 6 inch
    figure (100 dpi) with the axes hidden.

    Args:
        df: Data set forwarded unchanged to ``get_freqDist``.
        coll_name: Column name forwarded to ``get_freqDist``.
    """
    frequencies = get_freqDist(df, coll_name)

    cloud_builder = WordCloud(
        max_font_size=50,
        max_words=75,
        background_color="white",
    )
    cloud_image = cloud_builder.generate_from_frequencies(frequencies)

    plt.figure(figsize=(8, 6), dpi=100)
    plt.imshow(cloud_image, interpolation="bilinear")
    plt.axis("off")  # a word cloud has no meaningful axes
    plt.show()

In [53]:
def export_text_from_col(df,coll_name):
    """Concatenate the text values of one column into a single string.

    Every string in ``df[coll_name]`` is stripped of surrounding whitespace
    and lower-cased; the results are joined with single spaces.

    Values that are not ``str`` are skipped. The previous check
    (``type(row) is float``) only skipped built-in float NaN, so other
    missing-value markers such as ``None`` reached ``.strip()`` and raised
    ``AttributeError``.

    Args:
        df: Mapping-like table indexed by column name; ``df[coll_name]``
            must be iterable.
        coll_name: Name of the column to read.

    Returns:
        str: The cleaned values joined by ``" "``; an empty string when the
        column holds no strings.
    """
    cleaned = (
        value.strip().lower()
        for value in df[coll_name]
        if isinstance(value, str)  # drops NaN / None / other non-text cells
    )
    return " ".join(cleaned)

In [54]:
def get_freqDist(df,coll_name):
    """Count word occurrences in one text column, most frequent first.

    The column is flattened to a single string with
    ``export_text_from_col`` and tokenised by a ``CountVectorizer`` created
    with ``stop_words="english"``.

    Args:
        df: Data set forwarded unchanged to ``export_text_from_col``.
        coll_name: Name of the text column to analyse.

    Returns:
        dict: Maps each vocabulary term to its count, with entries inserted
        in descending order of count. Terms with equal counts keep the order
        in which the vectorizer reported them.
    """
    text = export_text_from_col(df, coll_name)
    vectorizer = CountVectorizer(stop_words="english")

    # fit_transform expects a collection of documents, so the whole text is
    # passed as a one-document list; row 0 holds that document's counts.
    counts = vectorizer.fit_transform([text]).toarray()[0]

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name on versions that
    # predate the new one.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # Descending sort by count; sorted() is stable, so ties keep term order.
    ranked = sorted(zip(terms, counts), key=lambda item: item[1], reverse=True)
    return dict(ranked)