In [1]:
from ipywidgets import interact, fixed
import pandas as pd
import zipfile, requests, io
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.utils import shuffle
import re
from wordcloud import WordCloud
from nltk.corpus import stopwords


In [2]:
def extract_full_df(url_link, file, limit):
    
    r = requests.get(url_link)
    
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall()
    
    df = pd.read_csv(z.open(file))
    df = shuffle(df)
    df = df[:limit]
    
    df = df[pd.notnull(df['Consumer complaint narrative'])]
    
    df = df.astype(str)
    
    return df

In [3]:
stop_words = set(stopwords.words('english'))

In [4]:
df = extract_full_df('http://files.consumerfinance.gov/ccdb/complaints.csv.zip',
                'complaints.csv', 800000)

In [5]:
def top_100_companies(frame):
    
    full_list = sorted(list(dict(frame['Company'].value_counts()[:100]).keys()))
    
    frame = frame[frame['Company'].isin(full_list)]
    
    v_groupby = {}

    frame['Year'] = [date[:7] for date in frame['Date received']]

    frame['Company'] = frame['Company'].str.upper()
    
    frame['Match Key'] = frame['Year'] + frame['Company']

            # group your chosen parameter by year in a dictionary
    groupby = dict(frame.groupby('Year')['Company'].apply(list))
    
    complaint_groupby = dict(frame.groupby('Match Key')['Consumer complaint narrative'].apply(list))
    
    complaint_groupby = {k: " ".join(v) for k, v in complaint_groupby.items()}
    
    for k, v in groupby.items():
        c = Counter(v)
        v_groupby[k] = c.most_common()

            # plot the figure
    df = pd.DataFrame([(k, *t) for k, v in v_groupby.items() for t in v], columns=['month','company','complaints'])
    
    df['Match Key'] = df['month'] + df['company']
    
    df['complaints_description'] = df['Match Key'].map(complaint_groupby)
    
    df['complaints_description'] = df['complaints_description'].str.lower()
    
    df = df.drop('Match Key', axis = 1)
    
    df['complaints_description'] = df['complaints_description'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    
    df['complaints_description'] = [re.sub(r'x{2,}','', x) for x in df['complaints_description']]

    return df

In [6]:
df1 = top_100_companies(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [7]:
@interact
def line_plot(column = fixed('company'), company = sorted(df1['company'].unique())):
    
    wordcloud = WordCloud(width=800, height=400).generate(" ".join(df1[df1['company'] == company]['complaints_description']))
    
    plt.figure(figsize=(12,6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    
    df1[df1['company'] == company].set_index('month').plot(title = "COMPLAINTS FOR {} OVER TIME".format(company),
                                                                figsize = (12,6))

interactive(children=(Dropdown(description='company', options=('ACE CASH EXPRESS, INC.', 'AD ASTRA RECOVERY SE…