In [1]:
import random
import pickle
from nltk.corpus import twitter_samples,stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk import NaiveBayesClassifier
from nltk import classify

def clean_data(token):
    """Drop mention and link tokens from a tokenized tweet.

    Args:
        token: iterable of string tokens.

    Returns:
        A new list holding, in order, every token that starts with neither
        '@' nor 'http'.
    """
    unwanted_prefixes = ('@', 'http')
    kept = []
    for word in token:
        if word.startswith(unwanted_prefixes):
            continue
        kept.append(word)
    return kept

def to_lower(token):
    """Return a new list with every token converted to lower case.

    Args:
        token: iterable of string tokens.

    Returns:
        List of the lower-cased tokens, in the original order.
    """
    lowered = []
    for word in token:
        lowered.append(word.lower())
    return lowered

def lemmatize(token):
    """Lemmatize each token using its part-of-speech tag as a hint.

    The tokens are tagged with ``pos_tag``. When the lower-cased first letter
    of a tag is one of 'n', 'v' or 'a', that letter is passed to the
    lemmatizer as the part of speech; otherwise the lemmatizer is called with
    its default part of speech.

    NOTE(review): with Penn Treebank style tags adjectives start with 'J', so
    the 'a' case presumably never triggers -- confirm. The rule is left as it
    is so that features stay identical to any model already trained with it.

    Args:
        token: list of string tokens.

    Returns:
        List of lemmatized tokens, in the original order.
    """
    wordnet = WordNetLemmatizer()

    def _lemma(word, tag):
        pos = tag[0].lower()
        if pos in "nva":
            return wordnet.lemmatize(word, pos)
        return wordnet.lemmatize(word)

    return [_lemma(word, tag) for word, tag in pos_tag(token)]

def remove_stop_words(token, stop_words):
    """Filter out every token that appears in ``stop_words``.

    Args:
        token: iterable of string tokens.
        stop_words: container supporting ``in`` (list, set, ...).

    Returns:
        A new list with the remaining tokens, in the original order.
    """
    kept = []
    for word in token:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

def transform_features(token):
    """Turn a token list into a bag-of-words feature dictionary.

    Args:
        token: iterable of hashable tokens.

    Returns:
        Dict mapping each distinct token to the number of times it occurs,
        with keys in order of first appearance.
    """
    counts = {}
    for word in token:
        counts[word] = counts.get(word, 0) + 1
    return counts

def main(train_size=7000, model_path="my_classifier.pickle", seed=None):
    """Train a Naive Bayes tweet-sentiment classifier and pickle it.

    The NLTK ``twitter_samples`` positive/negative tweets are cleaned,
    lemmatized, stripped of stop words, converted to bag-of-words features,
    shuffled, split into training and test sets, and used to train and
    evaluate the classifier, which is finally written to ``model_path``.

    Args:
        train_size: number of shuffled samples used for training; the rest
            form the test set.
        model_path: file the trained classifier is pickled to.
        seed: optional seed for the shuffle. ``None`` (default) uses the
            global ``random`` state, as before; pass an int to make the
            train/test split reproducible.
    """
    # Step 1: Gather data
    positive_tweets = twitter_samples.tokenized('positive_tweets.json')
    negative_tweets = twitter_samples.tokenized('negative_tweets.json')
    print(positive_tweets[0])
    print(negative_tweets[0])

    # Step 2: Clean, lemmatize and remove stop words from data
    stop_words = stopwords.words('english')

    def preprocess(tweet):
        # Single definition of the pipeline so both classes are treated alike.
        return remove_stop_words(lemmatize(clean_data(to_lower(tweet))), stop_words)

    positive_tweets = [preprocess(tweet) for tweet in positive_tweets]
    negative_tweets = [preprocess(tweet) for tweet in negative_tweets]
    print(positive_tweets[0])
    print(negative_tweets[0])

    # Step 3: Transform data
    positive_tweets = [(transform_features(token), "Positive") for token in positive_tweets]
    negative_tweets = [(transform_features(token), "Negative") for token in negative_tweets]
    print(positive_tweets[0])
    print(negative_tweets[0])

    # Step 4: Create data set
    dataset = positive_tweets + negative_tweets
    if seed is None:
        random.shuffle(dataset)
    else:
        # A private generator keeps the global random state untouched.
        random.Random(seed).shuffle(dataset)

    training_data = dataset[:train_size]
    test_data = dataset[train_size:]

    # Step 5: Train the model
    classifier = NaiveBayesClassifier.train(training_data)

    # Step 6: Test accuracy
    print("Accuracy:", classify.accuracy(classifier, test_data))
    # show_most_informative_features prints the table itself and returns
    # None, so wrapping it in print() would emit a stray "None" line.
    classifier.show_most_informative_features(10)

    # Step 7: Save the model
    with open(model_path, "wb") as f:
        pickle.dump(classifier, f)

# Entry point: run the whole training pipeline (main() prints its progress
# and writes the pickled classifier) when this code is executed directly.
if __name__ == "__main__":
    main()


['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['hopeless', 'for', 'tmr', ':(']
['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']
['hopeless', 'tmr', ':(']
({'#followfriday': 1, 'top': 1, 'engage': 1, 'member': 1, 'community': 1, 'week': 1, ':)': 1}, 'Positive')
({'hopeless': 1, 'tmr': 1, ':(': 1}, 'Negative')
Accuracy: 0.9953333333333333
Most Informative Features
                      :( = 1              Negati : Positi =   2018.2 : 1.0
                      :) = 1              Positi : Negati =    961.4 : 1.0
                       ( = 2              Negati : Positi =     40.0 : 1.0
                       ) = 2              Positi : Negati =     30.9 : 1.0
                follower = 1              Positi : Negati =     24.5 : 1.0
                     x15 = 1              Negati : Positi =     17.7 : 1.0
                    glad = 1              Positi :

In [None]:
!pip install folium
!pip install geopy
!pip install geopandas



In [4]:
import csv
import json
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim


class MoodClassifier:
    """Classify the mood of raw tweet text with a previously pickled model.

    The preprocessing steps (lower-casing, mention/link removal,
    lemmatization, stop-word removal, bag-of-words counting) are applied
    before the text is handed to the loaded classifier's ``classify`` method.
    """

    def __init__(self, classifier_file='my_classifier.pickle'):
        """Load the pickled classifier and the English stop-word list.

        Args:
            classifier_file: path of the pickle file holding the classifier.
        """
        # SECURITY: unpickling can execute arbitrary code -- only load
        # classifier files that come from a trusted source.
        # The context manager closes the file even if unpickling fails.
        with open(classifier_file, 'rb') as f:
            self.classifier = pickle.load(f)
        self.stop_words = stopwords.words('english')

    def clean_data(self, token):
        """Return the tokens that start with neither '@' nor 'http'."""
        return [item for item in token if not item.startswith(('@', 'http'))]

    def to_lower(self, token):
        """Return the tokens converted to lower case."""
        return [item.lower() for item in token]

    def lemmatize(self, token):
        """Lemmatize tokens, using the POS tag's first letter as a hint.

        When the lower-cased first letter of a tag is 'n', 'v' or 'a' it is
        passed to the lemmatizer as the part of speech; otherwise the
        lemmatizer's default is used.

        NOTE(review): with Penn Treebank style tags adjectives start with
        'J', so the 'a' case presumably never triggers -- confirm. Left
        unchanged so features keep matching the already trained model.
        """
        lemmatizer = WordNetLemmatizer()

        result = []
        for item, tag in pos_tag(token):
            pos = tag[0].lower()
            if pos in "nva":
                result.append(lemmatizer.lemmatize(item, pos))
            else:
                result.append(lemmatizer.lemmatize(item))

        return result

    def remove_stop_words(self, token):
        """Return the tokens that are not in ``self.stop_words``."""
        return [item for item in token if item not in self.stop_words]

    def transform_features(self, token):
        """Return a dict mapping each token to its number of occurrences."""
        feature_set = {}
        for feature in token:
            feature_set[feature] = feature_set.get(feature, 0) + 1
        return feature_set

    def get_mood(self, token):
        """Classify a raw text string.

        Args:
            token: the text to classify (despite the name, an untokenized
                string; it is tokenized here with ``word_tokenize``).

        Returns:
            Whatever label the loaded classifier's ``classify`` returns.
        """
        words = word_tokenize(token)
        custom_tokens = self.remove_stop_words(
            self.lemmatize(self.clean_data(self.to_lower(words))))
        return self.classifier.classify(self.transform_features(custom_tokens))


class Locator:
    """Geocode free-text location names through Nominatim, with a cache.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Geocoding client used for all lookups.
        self.geo_locator = Nominatim(user_agent="LearnPython")
        # Cache: location name -> geocoding result (may be None).
        self.location_store = {}
        # Number of requests actually sent to the geocoder.
        self.lookups = 0

    def get_location(self, location_name):
        """Return the geocoding result for ``location_name``.

        Args:
            location_name: free-text place name; may be empty or None.

        Returns:
            The cached or freshly fetched geocoder result, or None when the
            name is blank, or when the lookup timed out. Timed-out lookups
            are not cached, so they are retried on the next call.
        """
        if location_name in self.location_store:
            return self.location_store[location_name]

        # A blank name cannot be geocoded; skip the network round trip.
        if not location_name or not str(location_name).strip():
            return None

        self.lookups += 1
        try:
            location = self.geo_locator.geocode(location_name, language='en')
        except GeocoderTimedOut:
            return None

        self.location_store[location_name] = location
        return location


def process(input_file, output_file, max_tweets=101):
    """Classify and geocode tweets from a JSON file and write a CSV summary.

    For every processed tweet (a retweet is replaced by its
    ``retweeted_status``) one CSV row with the columns ``mood``,
    ``location``, ``latitude`` and ``longitude`` is written. ``location`` is
    the last comma-separated component of the geocoded address.

    Args:
        input_file: path of a JSON file containing a list of tweet dicts.
        output_file: path of the CSV file to write.
        max_tweets: maximum number of tweets to process; ``None`` processes
            all of them. The default of 101 keeps the number of tweets the
            previous hard-coded ``cnt > 100`` check let through.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform
    # default encoding.
    with open(input_file, encoding="utf-8") as f:
        tweets = json.load(f)

    print("Number of tweets", len(tweets))

    classifier = MoodClassifier()
    locator = Locator()

    # Fixed header so that an empty input still yields a valid CSV file.
    fieldnames = ['mood', 'location', 'latitude', 'longitude']

    csv_data = []
    for tweet in tweets:
        if max_tweets is not None and len(csv_data) >= max_tweets:
            break

        csv_data_item = dict.fromkeys(fieldnames)
        if 'retweeted_status' in tweet:
            tweet = tweet['retweeted_status']
        csv_data_item['mood'] = classifier.get_mood(tweet['full_text'])

        if 'location' in tweet['user']:
            location = locator.get_location(tweet['user']['location'])
            if location:
                csv_data_item['latitude'] = location.latitude
                csv_data_item['longitude'] = location.longitude
                csv_data_item['location'] = str(location.address).split(', ')[-1]

        csv_data.append(csv_data_item)

    print("All processed, lookups:", locator.lookups)

    # newline='' is required by the csv module; without it every row is
    # followed by a blank line on Windows.
    with open(output_file, 'w', newline='') as f:
        dict_writer = csv.DictWriter(f, fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(csv_data)


if __name__ == "__main__":
    # Run the same mood/location extraction for each tweet dump in turn.
    for input_file, output_file in (
        ("tweets_with_python.json", "mood_data_python.csv"),
        ("tweets_with_java.json", "mood_data_java.csv"),
    ):
        process(input_file, output_file)


Number of tweets 23306
All processed, lookups: 59
Number of tweets 18541
All processed, lookups: 57


In [10]:
import csv
import folium
import geopandas
import pandas as pd
from folium.plugins import FastMarkerCluster


def load_csv_file(csv_file):
    """Read a CSV file with a header row into a list of row dicts.

    Args:
        csv_file: path of the CSV file.

    Returns:
        List with one dict per data row, keyed by the header names.
    """
    # newline='' lets the csv module handle line endings itself, so newlines
    # embedded in quoted fields are preserved exactly.
    with open(csv_file, newline='') as f:
        return list(csv.DictReader(f))


def create_map(csv_file, output_html):
    """Render a mood choropleth plus tweet-location clusters to an HTML map.

    Args:
        csv_file: CSV with ``mood``, ``location``, ``latitude`` and
            ``longitude`` columns, read with ``load_csv_file``.
        output_html: path of the HTML file the folium map is saved to.
    """
    mood_content = load_csv_file(csv_file)

    # Count positive/negative rows per location. Rows without a location are
    # skipped so they are not aggregated under an empty country name.
    mood_location = {}
    for item in mood_content:
        country = item['location']
        if not country:
            continue
        counts = mood_location.setdefault(country, {'Positive': 0, 'Negative': 0})
        counts[item['mood']] += 1

    my_map = folium.Map()

    # NOTE(review): geopandas.datasets is deprecated/removed in recent
    # geopandas releases -- confirm the installed version still provides it.
    world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

    # Mood score per location = share of positive rows. Every entry has at
    # least one counted row, so the denominator is never zero.
    country_names = []
    moods = []
    for country, counts in mood_location.items():
        moods.append(counts['Positive'] / (counts['Positive'] + counts['Negative']))
        country_names.append(country)

    data_to_plot = pd.DataFrame({'Country': country_names, 'Mood': moods})

    # Only draw the choropleth when there is something to colour.
    if not data_to_plot.empty:
        folium.Choropleth(
            geo_data=world,
            name='choropleth',
            data=data_to_plot,
            columns=['Country', 'Mood'],
            key_on='feature.properties.name',
            fill_color='YlGn',
            fill_opacity=0.7,
            line_opacity=0.2,
            legend_name='Mood'
        ).add_to(my_map)

    # Marker clusters for every row that has both coordinates.
    cluster_data = [
        [float(row['latitude']), float(row['longitude'])]
        for row in mood_content
        if row['latitude'] and row['longitude']
    ]

    FastMarkerCluster(cluster_data).add_to(my_map)

    folium.LayerControl().add_to(my_map)

    my_map.save(output_html)


# Entry point: build one HTML map per mood CSV file.
if __name__ == "__main__":
    create_map("mood_data_java.csv", "mood_java.html")
    create_map("mood_data_python.csv", "mood_python.html")

ModuleNotFoundError: No module named 'geopandas'

In [5]:
# pip install geopandas

Collecting geopandas
  Downloading geopandas-0.11.0-py3-none-any.whl (1.0 MB)Note: you may need to restart the kernel to use updated packages.


    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\samda\anaconda3\python.exe' -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\samda\\AppData\\Local\\Temp\\pip-install-9tm11lus\\fiona_23ed2648a91744138b17c5646c297a04\\setup.py'"'"'; __file__='"'"'C:\\Users\\samda\\AppData\\Local\\Temp\\pip-install-9tm11lus\\fiona_23ed2648a91744138b17c5646c297a04\\setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\samda\AppData\Local\Temp\pip-pip-egg-info-36n51vle'
         cwd: C:\Users\samda\AppData\Local\Temp\pip-install-9tm11lus\fiona_23ed2648a91744138b17c5646c297a04\
    Complete output (1 lines):
    A GDAL API version must be specified. Provide a path to gdal-config using a GDAL_CONFIG environment variable or use a GDAL_VERSION environment variable.
    ----------------------------------------
    ERRO


Collecting fiona>=1.8
  Using cached Fiona-1.8.21.tar.gz (1.0 MB)
  Using cached Fiona-1.8.20.tar.gz (1.3 MB)
  Using cached Fiona-1.8.19.tar.gz (1.3 MB)
  Using cached Fiona-1.8.18.tar.gz (1.3 MB)
  Using cached Fiona-1.8.17.tar.gz (1.3 MB)
  Using cached Fiona-1.8.16.tar.gz (1.3 MB)
  Using cached Fiona-1.8.15.tar.gz (1.3 MB)
  Using cached Fiona-1.8.14.tar.gz (1.3 MB)
  Using cached Fiona-1.8.13.post1.tar.gz (1.2 MB)
  Using cached Fiona-1.8.13.tar.gz (1.2 MB)
  Using cached Fiona-1.8.12.tar.gz (1.2 MB)
  Using cached Fiona-1.8.11.tar.gz (1.2 MB)
  Using cached Fiona-1.8.10.tar.gz (1.2 MB)
  Using cached Fiona-1.8.9.post2.tar.gz (1.2 MB)
  Using cached Fiona-1.8.9.post1.tar.gz (1.2 MB)
  Using cached Fiona-1.8.9.tar.gz (1.2 MB)
  Using cached Fiona-1.8.8.tar.gz (1.7 MB)
  Using cached Fiona-1.8.7.tar.gz (1.7 MB)
  Using cached Fiona-1.8.6.tar.gz (1.7 MB)
  Using cached Fiona-1.8.5.tar.gz (1.7 MB)
  Using cached Fiona-1.8.4.tar.gz (1.1 MB)
  Using cached Fiona-1.8.3.tar.gz (1.1 MB)


    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\samda\anaconda3\python.exe' -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\samda\\AppData\\Local\\Temp\\pip-install-9tm11lus\\fiona_877742fe3875484bb067df3faa8db850\\setup.py'"'"'; __file__='"'"'C:\\Users\\samda\\AppData\\Local\\Temp\\pip-install-9tm11lus\\fiona_877742fe3875484bb067df3faa8db850\\setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\samda\AppData\Local\Temp\pip-pip-egg-info-wvixfipb'
         cwd: C:\Users\samda\AppData\Local\Temp\pip-install-9tm11lus\fiona_877742fe3875484bb067df3faa8db850\
    Complete output (1 lines):
    A GDAL API version must be specified. Provide a path to gdal-config using a GDAL_CONFIG environment variable or use a GDAL_VERSION environment variable.
    ----------------------------------------
    ERRO