# Answer Questions with NoSQL

In [2]:
import csv
import json
import pandas as pd
import sys, getopt, pprint
from pymongo import MongoClient

We transformed our MySQL database into MongoDB in a way that they share the same csv tables that we use to import into database as our tables.
In this way,the database in MySQL & MongoDB are completely the same.

We import those csv form of data into MongoDB using Command Line,which is clarified in the README in NoSQL File.
Now, we will begin answer question using NoSQL database.

# Connect to MongoDB

In [5]:
client = MongoClient('mongodb://localhost:27017')
db = client.movies
movieMain = db.movieMain
movieActor = db.movieActor
movieDirector = db.movieDirector
moviePerformance = db.moviePerformance
twitterTag = db.taginfo
twitterUserClean = db.twitterUserClean
youtubeData = db.youtubeData

## 1. What are people saying about me ?

In [8]:
# Show tags in Twitter and YouTube about a single movie
uc1_2 =twitterTag.aggregate([
 
    {
        "$lookup":
        {
            "from":"youtubeData",
            "localField":"movie_name",
            "foreignField":"movie_name",
            "as":"movieTag"
        }
    },
    
     {"$unwind": "$movieTag"},
                                                
     {
         "$match":{"movie_name":"Fantastic Beasts and Where to Find Them"}
     },
    {
        "$project":{"tag":1,"movieTag.tag":1}
    }
    ])
print(list(uc1_2))

[{'_id': ObjectId('5ad3662406651e8d9521d3af'), 'tag': 'FantasticBeastsAndWhereToFindThem', 'movieTag': {'tag': 'Fantastic Beasts: The Crimes of Grindelwald FULL\'M.o.V.i.E\'2018\'HD\'"'}}, {'_id': ObjectId('5ad3662406651e8d9521d3b0'), 'tag': 'Vidimovie', 'movieTag': {'tag': 'Fantastic Beasts: The Crimes of Grindelwald FULL\'M.o.V.i.E\'2018\'HD\'"'}}, {'_id': ObjectId('5ad3662406651e8d9521d3b1'), 'tag': 'contentmarketing', 'movieTag': {'tag': 'Fantastic Beasts: The Crimes of Grindelwald FULL\'M.o.V.i.E\'2018\'HD\'"'}}]


## 2. How viral are my posts ?

In [9]:
# Using retweet count to indicate the spread of the post 
uc2 =twitterTag.aggregate([
    {"$match":{"movie_name":"La La Land"}},
    {"$group":{'_id':{'user_id':"$user_id",'retweet_count':"$retweet_count"}}},
    {"$project":{'retweet_count':1,'user_id':1}}
])
print(list(uc2))

[{'_id': {'user_id': 981237161113595000, 'retweet_count': 1}}, {'_id': {'user_id': 981237086870151000, 'retweet_count': 1}}, {'_id': {'user_id': 981237672470396000, 'retweet_count': 114}}]


## 3. How much influence to my posts have ?

In [3]:
# First we normalize the data and then we use weighting of follwer count, retweet count and favorite count to show the influence of the post.
uc3=twitterTag.aggregate([
    {"$lookup":
     {
         "from":"twitterUserClean",
         "localField":"user_id",
         "foreignField":"user_id",
         "as":"result"
     }
    },
    {"$unwind":"$result"},
    {
        "$group":{
            "_id":"$result.user_name",
            "avg_influence":{"$avg":
                             {"$add":
                             [{"$multiply":[{"$divide":["$result.follower_count",1242207]},0.5]},
                              {"$multiply":[{"$divide":["$retweet_count",283809]},0.3]},
                              {"$multiply":[{"$divide":["$favorite_count",14]},0.2]}]
                             }
                            }
        }
    },
    {"$sort":{
        "avg_influence":-1
    }}
])
print(list(uc3))

[{'_id': 'Shinobi Ninja', 'avg_influence': 0.5}, {'_id': 'M. Saad Arslan Sadiq', 'avg_influence': 0.43609126832095}, {'_id': 'The Ringer', 'avg_influence': 0.2054221907379523}, {'_id': 'Hutch', 'avg_influence': 0.1628933940041509}, {'_id': 'Portal Games', 'avg_influence': 0.12172127051748563}, {'_id': 'Bibo', 'avg_influence': 0.11441169972936417}, {'_id': 'Gobierno Digital', 'avg_influence': 0.10225123196352753}, {'_id': 'J�r�me Voiturier', 'avg_influence': 0.10008986114807142}, {'_id': 'r�dio grenal', 'avg_influence': 0.09075505951619844}, {'_id': 'Tim Hughes ??�??', 'avg_influence': 0.0805536541603623}, {'_id': 'Cinema Guide', 'avg_influence': 0.07928932709512758}, {'_id': 'Ethan Silver', 'avg_influence': 0.07694128273307106}, {'_id': 'GK Edits', 'avg_influence': 0.0730832386625699}, {'_id': 'Roberto Blizzard', 'avg_influence': 0.07191379076572124}, {'_id': '??TheHiddenPages.com', 'avg_influence': 0.07178030714687648}, {'_id': 'Bold', 'avg_influence': 0.07153618996731143}, {'_id': 'T

## 4. What posts are like mine ?

### Alike posts in Twitter

In [29]:
# First we find tags in Twitter data which are mentioned more than one time
uc4=twitterTag.aggregate([
    {"$match":{"movie_name":"Mindhorn"}},
    {"$group":{'_id':"$tag",'count':{"$sum":1}}},
    {"$match":{'count':{"$gt":1}}},
    {"$sort":{'count':-1}},
])
print(list(uc4))

[{'_id': 'Mindhorn', 'count': 4}, {'_id': 'Movie', 'count': 3}]


In [12]:
# Next, find the twitter id whose content contains those tags
uc4_2=twitterTag.distinct('twitter_id',{'movie_name':"Mindhorn",'tag':"Mindhorn",'tag':"Movie"})
print(list(uc4_2))

[981181550527205000, 980799092723998000, 980484071389351000]


### Alike videos in Youtube

In [25]:
# First we find tags in YouTube data which are mentioned more than one time
uc4_3=youtubeData.aggregate([
    {"$match":{"movie_name":"Nocturnal Animals"}},
    {"$group":{'_id':"$tag",'count':{"$sum":1}}},
    {"$match":{'count':{"$gt":1}}},
    {"$sort":{'count':-1}},
])
print(list(uc4_3))

[{'_id': "suhendar23418'", 'count': 2}]


In [29]:
# Next, find the video id whose content contains those tags
uc4_4=youtubeData.distinct('video_Id',{'movie_name':"Nocturnal Animals",'tag':"suhendar23418'"})
print(list(uc4_4))

['g9iW2UgPYPk', 'QnvMDsjznxg']


## 5. What topics are trending in my domain ?

### Trend in Twitter

In [4]:
# We are trying to use tag amount to find hot topic within an hour
uc5 =twitterTag.aggregate([
    {"$match":{"time":{"$regex":"^18"}}},
    {"$group":{'_id':"$movie_name",'count':{"$sum":1}}},
    {"$sort":{"count":-1}},
    {"$limit":3}
])
print(list(uc5))

[{'_id': 'Boyhood', 'count': 39}, {'_id': 'Thor', 'count': 27}, {'_id': 'Bridesmaids', 'count': 27}]


### Trend in YouTube

In [10]:
# We are trying to use tag amount to find hot topic within an hour
uc5 =youtubeData.aggregate([
    {"$match":{"publish_time":{"$regex":"^18"}}},
    {"$group":{'_id':"$movie_name",'count':{"$sum":1}}},
    {"$sort":{"count":-1}},
    {"$limit":3}
])
print(list(uc5))

[{'_id': 'Lavender', 'count': 7}, {'_id': 'Unstoppable', 'count': 6}, {'_id': 'Centurion', 'count': 5}]


## 6. What users post like me ?

### Alike users in Twitter

In [4]:
# First we find tags in Twitter data which are mentioned more than one time
uc6_1=twitterTag.aggregate([
    {"$match":{"movie_name":"The Accountant"}},
    {"$group":{'_id':"$tag",'count':{"$sum":1}}},
    {"$match":{'count':{"$gt":1}}},
    {"$sort":{'count':-1}},
])
print(list(uc6_1))

[{'_id': 'Boston', 'count': 2}, {'_id': 'Senior', 'count': 2}]


In [16]:
# Then, we use those tags to find users in twitter user table
# Join twitterTag and twitterUserClean tables together
uc6_2 =twitterTag.aggregate([
 
    {
        "$lookup":
        {
            "from":"twitterUserClean",
            "localField":"user_id",
            "foreignField":"user_id",
            "as":"user"
        }
    },
    
     {"$unwind": "$user"},
                                                
     {
         "$match":{"movie_name":"The Accountant","tag":"Boston","tag":"Senior"}
     },
    {
        "$project":{"user.user_id":1,"user.user_name":1}
    }
    ])
print(list(uc6_2))

[{'_id': ObjectId('5ad3662406651e8d9521d46c'), 'user': {'user_name': 'Wow Jobs Schweiz', 'user_id': 981237784395444000}}, {'_id': ObjectId('5ad3662406651e8d9521d46f'), 'user': {'user_name': 'jobsuche jobs', 'user_id': 981237365862789000}}]


### Alike users in YouTube

In [12]:
# First we find tags in YouTube data which are mentioned more than one time
uc6_3=youtubeData.aggregate([
    {"$match":{"movie_name":"Barbershop: The Next Cut"}},
    {"$group":{'_id':"$tag",'count':{"$sum":1}}},
    {"$match":{'count':{"$gt":1}}},
    {"$sort":{'count':-1}},
])
print(list(uc6_3))

[{'_id': ' ["Barbershop: The Next Cut FULL\'M.o.V.i.E\'2016\'HD\'"] ', 'count': 4}, {'_id': "['Wawna10418']", 'count': 2}]


In [17]:
# Then we use those tags to find channel id whose video has same tags
uc6_3=youtubeData.aggregate([
    {"$match":{"movie_name":"Barbershop: The Next Cut"}},
    {"$match":{"tag":"Barbershop: The Next Cut FULL\'M.o.V.i.E\'2016\'HD\'","tag":"['Wawna10418']"}},
    {"$project":{"channel_Id":1}},
])
print(list(uc6_3))

[{'_id': ObjectId('5ae37fca45c6ca3f0f19bf41'), 'channel_Id': 'UCVt-ttKZFa1DTX9bll4iDjw'}, {'_id': ObjectId('5ae37fca45c6ca3f0f19bf45'), 'channel_Id': 'UCgGyewH9BI3dWTRXyK6vujQ'}]


## 7. What keywords/ hashtags should I add to my post ?

In [34]:
# First, we find movies with more tags, which means this movie is a hot topic.
uc7_1 =twitterTag.aggregate([{"$group":{'_id':"$movie_name",'count':{"$sum":1}}},{"$sort":{"count":-1}},{"$limit":10}])
print(list(uc7_1))

[{'_id': '', 'count': 1046144}, {'_id': 'Boyhood', 'count': 39}, {'_id': 'A United Kingdom', 'count': 28}, {'_id': 'The Secret Scripture', 'count': 28}, {'_id': 'Bridesmaids', 'count': 27}, {'_id': 'Thor', 'count': 27}, {'_id': 'Lovesong', 'count': 22}, {'_id': 'The Intern', 'count': 21}, {'_id': 'Regression', 'count': 18}, {'_id': 'Mamma Mia!', 'count': 18}]


In [10]:
# Then we choose the hot movie and find which tags have more retweet count. The tags with more retweet count are the tag we should add to our post
uc7_2 =twitterTag.aggregate([{"$match":{"movie_name":"Boyhood"}},{"$group":{'_id':"$tag",'avgRetweet':{"$avg":"$retweet_count"}}},{"$sort":{"avgRetweet":-1}}])
print(list(uc7_2))

[{'_id': 'TrackList', 'avgRetweet': 2551.0}, {'_id': 'Ridewithme', 'avgRetweet': 2551.0}, {'_id': '20180407_DEBUT', 'avgRetweet': 2551.0}, {'_id': 'ONLYONE', 'avgRetweet': 2551.0}, {'_id': '??', 'avgRetweet': 2551.0}, {'_id': 'UNB', 'avgRetweet': 851.6666666666666}, {'_id': 'BOYHOOD', 'avgRetweet': 851.6666666666666}, {'_id': '???', 'avgRetweet': 460.5}, {'_id': 'UNB_JUN', 'avgRetweet': 69.33333333333333}, {'_id': '????', 'avgRetweet': 69.33333333333333}, {'_id': '?', 'avgRetweet': 69.33333333333333}, {'_id': 'UNB_BOYHOOD', 'avgRetweet': 69.33333333333333}, {'_id': 'JUN', 'avgRetweet': 69.33333333333333}, {'_id': 'LetsBeginUNB', 'avgRetweet': 69.33333333333333}, {'_id': '????????', 'avgRetweet': 40.0}, {'_id': 'CASPER', 'avgRetweet': 2.0}]


## 8. Who should I be following ?

In [66]:
# We find the user whose tweet contains same tags, and whose follower more than 2000 people.
# If the user have the same tags with me, which means there is a topic we both following.
# And when the user has more follower, which means he or she has a higher influence.
uc8_1 =twitterTag.aggregate([
 
    {
        "$lookup":
        {
            "from":"twitterUserClean",
            "localField":"user_id",
            "foreignField":"user_id",
            "as":"user"
        }
    },
    
     {"$unwind": "$user"},                                          
     {
         "$match":{"movie_name":"The Accountant","tag":"Boston","tag":"Senior","user.follower_count":{"$gte":2000}}
     },
    {
        "$project":{"user.user_id":1,"user.user_name":1,"user.follower_count":1}
    }
    ])
print(list(uc8_1))

[{'_id': ObjectId('5ad3662406651e8d9521d46c'), 'user': {'user_name': 'Wow Jobs Schweiz', 'user_id': 981237784395444000, 'follower_count': 2508}}]
