# Solutions:

## Module 1

In [None]:
# exercise 1:

# Count the non-null `id` values in each `user_name` group (one count per account).
df3 = df.groupby(['user_name'])[['id']].count()

# Show the 10 accounts with the highest counts. display() is needed because this is not
# the last expression of the cell, so the notebook would otherwise discard the result.
display(df3.sort_values(by=['id'], ascending=False).head(10))


# exercise 2:

# Keep the rows whose `user_location` contains the literal text "New York";
# rows with a missing location are treated as non-matching (na=False).
NYdf = df.loc[
    df['user_location'].str.contains('New York', regex=False, na=False)
]

# Daily counts: index by `date`, bucket by calendar day, count non-null `user_name` values.
# NOTE(review): pd.Grouper(freq='D') presumably requires `date` to be datetime-like - confirm.
df_day2 = (
    NYdf
    .set_index('date')
    .groupby(pd.Grouper(freq='D'))[['user_name']]
    .count()
)

# Line chart of the daily counts.
fig = plt.figure(figsize=(10, 5))

plt.plot(df_day2.index, df_day2['user_name'])
plt.xticks(rotation=30, ha='right')
plt.title('Number of daily tweets from New York')
plt.xlabel('Date')
plt.ylabel('Tweets')

## Module 2

In [None]:
# exercise 1: Most retweeted tweet

# Rows whose retweet count equals the maximum. Several rows are possible when tweets tie.
most_retweeted_tweet = tweets_by_account.loc[tweets_by_account.nretweets == tweets_by_account.nretweets.max()]
print('Most retweeted Tweet:')
# Print explicitly: a bare expression in the middle of a cell is never rendered.
# Looping (instead of calling .item()) also avoids a ValueError when more than one
# tweet shares the maximum retweet count.
for tweet_text in most_retweeted_tweet['tweet']:
    print(tweet_text)


# exercise 2:

# visualize the date range of tweets collected

# One value per `date_only` group: the count of non-null `id` entries.
tweets_by_keyword_day = tweets_by_keyword.groupby(['date_only'])['id'].count()

# Bar chart with the group keys on the x axis and the counts on the y axis.
plt.figure(figsize=(10, 8))
sns.barplot(
    x=tweets_by_keyword_day.index,
    y=tweets_by_keyword_day.values,
    color='c',
)
plt.title('VOLUME OF TWEETS')
plt.show()

## Module 3

In [None]:
# exercise 1: Remove Punctuation

# Matches any single character that is neither a word character nor whitespace.
# Note: `\w` also matches the underscore, so "_" is left in place.
punctuation_pattern = re.compile(r'[^\w\s]')

# Replace each matched character with a space in every tweet of the corpus.
corpus = [punctuation_pattern.sub(' ', tweet) for tweet in corpus]


# exercise 2: Most frequent words on the day of Cambodia's Commune Elections

# select date of interest
tweets_df_date = tweets_df.loc[tweets_df.date == '2022-06-05']

# Flatten the per-tweet entries of `final_text` into a single list of words.
# NOTE(review): assumes each `final_text` value is an iterable of word strings - confirm.
all_words = list(itertools.chain.from_iterable(tweets_df_date.final_text))

# count how many times each word occurs
word_counts = collections.Counter(all_words)

# Top 15 most common words. display() is needed because this is not the last
# expression of the cell, so the notebook would otherwise discard the result.
display(word_counts.most_common(15))


# exercise 3: Wordcloud for the day of Cambodia's Commune Elections

# Restrict to the rows dated 5 June 2022.
tweets_df_date = tweets_df.loc[tweets_df.date == '2022-06-05']

# Join the words of each tweet with spaces, then join all tweets into one long string.
commune_election_tweets = " ".join(
    " ".join(tweet) for tweet in tweets_df_date.final_text
)

# Build the word cloud from the combined text (at most 120 words, white background).
wordcloud = WordCloud(
    max_words=120,
    background_color="white",
    width=1000,
    height=600,
).generate(commune_election_tweets)

# Render the cloud as an image without axes.
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

## Module 4

In [None]:
# Exercise: Model training using a disaster response dataset 
## TO DO: Save and load the model, and classify on a new text

def save_model(tfidf_vectorizer, model_dir, model_file_name, label_encoder, model=None):
    """Pickle the model, vectorizer and label encoder, and write a config file listing them.

    Four files are written into ``model_dir``, all named after the base name of
    ``model_file_name`` (directory part and extension stripped):

    * ``<base>.hdf5``          - the pickled model (a pickle, despite the extension)
    * ``<base>.tokenizer``     - the pickled ``tfidf_vectorizer``
    * ``<base>.label_encoder`` - the pickled ``label_encoder``
    * ``<base>.config``        - ``key=value`` lines giving the paths of the three files above

    Parameters
    ----------
    tfidf_vectorizer : object
        Object to pickle into the ``.tokenizer`` file.
    model_dir : str
        Target directory; created if it does not exist.
    model_file_name : str
        Name from which the base file name is derived.
    label_encoder : object
        Object to pickle into the ``.label_encoder`` file.
    model : object, optional
        Model to pickle. When omitted, the notebook-level variable ``classifier``
        is used, which is what this function always did before.

    Notes
    -----
    The paths stored in the config file are built from ``model_dir`` as given, so a
    relative ``model_dir`` yields relative paths in the config.
    """
    if model is None:
        # Backward-compatible fallback to the global the original code relied on.
        model = classifier

    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(model_file_name))[0]
    model_file = os.path.join(model_dir, base_name + ".hdf5")
    tokenizer_file = os.path.join(model_dir, base_name + ".tokenizer")
    label_encoder_file = os.path.join(model_dir, base_name + ".label_encoder")
    config_path = os.path.join(model_dir, base_name + ".config")

    # The context manager guarantees the config file is closed even if a write fails.
    with open(config_path, "w") as config_handle:
        config_handle.write("model_file=" + model_file + "\n")
        config_handle.write("tokenizer_file=" + tokenizer_file + "\n")
        config_handle.write("label_encoder_file=" + label_encoder_file + "\n")

    # saving the model (pickle format)
    with open(model_file, 'wb') as handle:
        pickle.dump(model, handle)

    # saving tokenizer
    with open(tokenizer_file, 'wb') as handle:
        pickle.dump(tfidf_vectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # saving label_encoder
    with open(label_encoder_file, 'wb') as handle:
        pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Directory and base name under which the trained artifacts are stored.
model_dir="./model/"
# Create the directory in pure Python instead of shelling out to `mkdir -p`, so the
# cell does not depend on a POSIX shell; no error is raised if it already exists.
os.makedirs(model_dir, exist_ok=True)
model_file_name="sentiment_data"
save_model(tfidf_vectorizer, model_dir, model_file_name, label_encoder)

def predict_single_item(loaded_model, tokenizer, label_encoder, text):
    """Classify one text and return ``(label, probability)``.

    The text is vectorized with ``tokenizer.transform``. If the resulting feature
    row sums to zero, the string ``"None"`` is returned with probability ``1.0``
    and the model is not consulted (presumably this means none of the text's terms
    are known to the vectorizer - confirm). Otherwise the model's prediction is
    decoded with ``label_encoder.inverse_transform`` and returned together with
    the probability that ``predict_proba`` assigns to that label.

    Probabilities are matched to labels by pairing ``label_encoder.classes_`` with
    the first row of ``predict_proba`` position by position.
    """
    features = tokenizer.transform([text])

    # An all-zero feature row short-circuits to the placeholder label.
    if features.toarray()[0].sum() == 0.0:
        return "None", 1.0

    decoded = label_encoder.inverse_transform(loaded_model.predict(features))
    probabilities = loaded_model.predict_proba(features)[0]

    label = decoded[0]
    probability_by_class = dict(zip(label_encoder.classes_, probabilities))
    return label, probability_by_class.get(label)


def read_config(configfile):
    """Read a ``key=value`` config file into a dictionary.

    Each line is stripped of surrounding whitespace; blank lines are skipped.
    The line is split at the first ``=`` only, so values may themselves contain ``=``.

    Parameters
    ----------
    configfile : str
        Path of the config file to read.

    Returns
    -------
    dict
        Mapping of each key to its value, both as strings.

    Raises
    ------
    ValueError
        If a non-blank line contains no ``=``.
    """
    configdict = {}
    # Plain 'r' already uses universal newlines; the old 'rU' mode is rejected
    # by Python 3.11 and later.
    with open(configfile, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            key, separator, value = line.partition("=")
            if not separator:
                raise ValueError(
                    "Malformed line %d in %s (expected key=value): %r"
                    % (line_number, configfile, line)
                )
            configdict[key] = value
    return configdict

def load_models(config_dictionary):
    """Unpickle the model, tokenizer and label encoder named in ``config_dictionary``.

    The dictionary must provide the file paths under the keys ``"model_file"``,
    ``"tokenizer_file"`` and ``"label_encoder_file"``.

    Returns the tuple ``(loaded_model, tokenizer, label_encoder)``.

    Security note: ``pickle.load`` can execute arbitrary code while loading, so
    only use this on files you created yourself or otherwise trust.
    """
    def _unpickle(path):
        # Read one pickled object from `path`.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    # Load from file
    loaded_model = _unpickle(config_dictionary["model_file"])

    # Resolve both remaining paths before reading either file.
    tokenizer_path = config_dictionary["tokenizer_file"]
    label_encoder_path = config_dictionary["label_encoder_file"]

    tokenizer = _unpickle(tokenizer_path)
    label_encoder = _unpickle(label_encoder_path)

    return loaded_model, tokenizer, label_encoder


# Locate the config file from the same directory and base name that were used when
# saving, instead of a hard-coded absolute path that only exists on Colab.
config_file_name = os.path.join(
    model_dir,
    os.path.splitext(os.path.basename(model_file_name))[0] + ".config",
)
config_dictionary = read_config(config_file_name)
loaded_model, tokenizer, label_encoder = load_models(config_dictionary)

## Module 5