In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
import pickle
import json
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
import pandas as pd

In [None]:
class moodClassifier:
    """Classify the mood of a text using a pre-trained, pickled classifier.

    ``get_mood`` runs the whole pipeline: tokenize, drop mentions/links and
    lower-case, lemmatize, drop English stop words, build a feature dict and
    hand it to ``self.classifier.classify``.
    """

    def __init__(self, classifier_file = 'model.pkl'):
        """Load the classifier and the English stop-word list.

        Args:
            classifier_file: Path of the pickle file containing the classifier.

        Raises:
            OSError: If ``classifier_file`` cannot be opened.
        """
        # SECURITY: pickle.load can execute arbitrary code stored in the file;
        # only load model files that come from a trusted source.
        # The context manager closes the file even if unpickling fails.
        with open(classifier_file, 'rb') as f:
            self.classifier = pickle.load(f)
        self.stop_words = stopwords.words('english')

    def clean_data(self, token):
        """Return the tokens lower-cased, without ``@mentions`` and ``http`` links.

        Args:
            token: Iterable of string tokens.

        Returns:
            A new list of lower-cased tokens.
        """
        return [item.lower() for item in token if not item.startswith('@') and not item.startswith('http')]

    def lemmatize(self, token):
        """Lemmatize every token, using its POS tag when the tag starts with n, v or a.

        Args:
            token: List of string tokens.

        Returns:
            A list with one lemma per input token, in the same order.
        """
        lemmatizer = WordNetLemmatizer()

        result = []
        for item, tag in pos_tag(token):
            # NOTE(review): if pos_tag emits Penn Treebank tags, adjectives are
            # tagged "JJ*", so the "a" case may never match. Left unchanged
            # because the pickled model presumably was trained with exactly
            # this preprocessing - confirm before altering.
            pos = tag[0].lower()
            if pos in "nva":
                result.append(lemmatizer.lemmatize(item, pos))
            else:
                result.append(lemmatizer.lemmatize(item))
        return result

    def remove_stop_words(self, token):
        """Return the tokens that are not in ``self.stop_words``.

        Uses the list loaded once in ``__init__`` instead of re-reading the
        stop-word corpus on every call.

        Args:
            token: Iterable of string tokens.

        Returns:
            A new list with the stop words removed, order preserved.
        """
        # A set makes each membership test O(1) instead of scanning the list.
        blocked = set(self.stop_words)
        return [item for item in token if item not in blocked]

    def transform_features(self, token):
        """Build the feature dict passed to the classifier.

        Args:
            token: Iterable of hashable tokens.

        Returns:
            A dict mapping every distinct token to ``1`` (first-seen order).
        """
        return dict.fromkeys(token, 1)

    def get_mood(self, token):
        """Classify the mood of a raw text.

        Args:
            token: The raw text (a string) to classify.

        Returns:
            Whatever ``self.classifier.classify`` returns for the text's features.
        """
        words = word_tokenize(token)
        custom_tokens = self.remove_stop_words(self.lemmatize(self.clean_data(words)))
        category = self.classifier.classify(self.transform_features(custom_tokens))
        return category
        

In [None]:
class locator:
    """Geocode free-text place names through Nominatim with a per-instance cache."""

    def __init__(self):
        """Create the Nominatim client and an empty result cache."""
        self.geo_locator = Nominatim(user_agent="LearnPython")
        # Maps a looked-up name to the geocode result stored for it.
        self.location_store = {}
        # Number of geocode requests attempted (cache hits are not counted).
        self.lookups = 0

    def get_location(self, location_name):
        """Return the geocode result for ``location_name``.

        Results are cached in ``self.location_store`` so each distinct name is
        sent to the service at most once per successful request. A request
        that times out is not cached, so a later call retries it.

        Args:
            location_name: Free-text place name.

        Returns:
            The object returned by ``geo_locator.geocode``, or ``None`` when
            the name is blank/``None`` or the request timed out.
        """
        # A blank name cannot be geocoded; skip the network round trip.
        if location_name is None or not str(location_name).strip():
            return None
        if location_name in self.location_store:
            return self.location_store[location_name]

        self.lookups += 1
        try:
            location = self.geo_locator.geocode(location_name, language='en')
        except GeocoderTimedOut:
            return None
        self.location_store[location_name] = location
        return location

In [None]:
def process(input_file, output_file, limit=10):
    """Classify tweets, geocode their authors' locations and write a CSV.

    Args:
        input_file: Path of a JSON file containing a list of tweet dicts. Each
            tweet is read for ``full_text``, ``user.location`` and, when
            present, ``retweeted_status`` (which then replaces the tweet).
        output_file: Path of the CSV file to write. Columns are ``mood``,
            ``location``, ``latitude`` and ``longitude``.
        limit: Maximum number of tweets to process from the start of the
            file. Defaults to 10; pass ``None`` to process every tweet.
    """
    with open(input_file, encoding='utf-8') as f:
        tweets = json.load(f)
    print(len(tweets))

    classifier = moodClassifier()
    # A single locator is shared by all tweets so its cache is actually reused;
    # building one per tweet would geocode repeated locations again and again.
    geo = locator()

    # Fixed column list: the CSV header is written even when no tweet is processed.
    columns = ['mood', 'location', 'latitude', 'longitude']
    csv_data = []

    for tweet in tweets[:limit]:
        csv_data_item = dict.fromkeys(columns)

        if 'retweeted_status' in tweet:
            tweet = tweet['retweeted_status']
        csv_data_item['mood'] = classifier.get_mood(tweet['full_text'])

        # Tolerate a missing/null user or location; blank names are not looked up.
        place_name = (tweet.get('user') or {}).get('location')
        if place_name:
            location = geo.get_location(place_name)
            if location:
                csv_data_item['latitude'] = location.latitude
                csv_data_item['longitude'] = location.longitude
                # Keep only the last comma-separated part of the address
                # (presumably the country - verify against geocoder output).
                csv_data_item['location'] = str(location.address).split(", ")[-1]

        csv_data.append(csv_data_item)

    for itm in csv_data:
        print(itm)
    df = pd.DataFrame(csv_data, columns=columns)
    df.to_csv(output_file, index=False)

In [None]:
# Run the tweet pipeline on the sample export and write the rows to mydata.csv.
input_file = "tweets_with_python.json"
process(input_file=input_file, output_file="mydata.csv")

In [3]:
# Load the CSV back from disk and show it as the cell's output.
DF = pd.read_csv(filepath_or_buffer="mydata.csv")
DF

Unnamed: 0,mood,location,latitude,longitude
0,Negative,,,
1,Negative,,,
2,Positive,United States,37.788497,-122.355847
3,Positive,Malaysia,5.942814,116.078855
4,Positive,United States,39.78373,-100.445882
5,Negative,United Kingdom,53.479489,-2.245115
6,Negative,United States,37.779026,-122.419906
7,Positive,,,
8,Positive,Argentina,-34.996496,-64.967282
9,Negative,Nigeria,5.792749,6.121652
